maguopeng, 2 years ago
Parent
Current commit
c0b848b691
100 files changed, with 1,627 insertions and 9,464 deletions
  1. + 0 - 47  Crawlb/docker-compose.yml
  2. + 0 - 54  Crawlb/docker-compose_work.yml
  3. + 0 - 170  Details/detail_cookie.py
  4. + 0 - 117  Details/detail_firefox.py
  5. + 0 - 164  Details/details.py
  6. + 0 - 200  Details/details_webcookie.py
  7. + 0 - 15  FworkSpider/details/__init__.py
  8. + 0 - 134  FworkSpider/details/detail_ztlbw.py
  9. + 0 - 170  FworkSpider/details/details.py
  10. + 0 - 165  FworkSpider/details/details_cookie.py
  11. + 0 - 115  FworkSpider/details/details_firefox.py
  12. + 0 - 150  FworkSpider/details/details_login.py
  13. + 0 - 88  FworkSpider/details/dtcookie_pool.py
  14. + 0 - 1  FworkSpider/details/file/sj.js
  15. + 29 - 22  FworkSpider/feapder/buffer/item_buffer.py
  16. + 3 - 3  FworkSpider/feapder/buffer/request_buffer.py
  17. + 1 - 1  FworkSpider/feapder/commands/create_builder.py
  18. + 33 - 0  FworkSpider/feapder/core/base_parser.py
  19. + 6 - 5  FworkSpider/feapder/core/collector.py
  20. + 40 - 25  FworkSpider/feapder/core/parser_control.py
  21. + 62 - 45  FworkSpider/feapder/core/scheduler.py
  22. + 76 - 116  FworkSpider/feapder/dedup/__init__.py
  23. + 178 - 0  FworkSpider/feapder/dedup/old__init__.py
  24. + 1 - 3  FworkSpider/feapder/network/cookie_pool.py
  25. + 1 - 1  FworkSpider/feapder/network/proxy_pool.py
  26. + 83 - 27  FworkSpider/feapder/network/request.py
  27. + 513 - 0  FworkSpider/feapder/network/request6.29.py
  28. + 31 - 13  FworkSpider/feapder/templates/spider_list_template.tmpl
  29. + 1 - 1  FworkSpider/feapder/templates/spider_template.tmpl
  30. + 37 - 17  FworkSpider/feapder/utils/aliyun.py
  31. + 1 - 1  FworkSpider/feapder/utils/email_sender.py
  32. + 41 - 47  FworkSpider/feapder/utils/log.py
  33. + 1 - 1  FworkSpider/feapder/utils/redis_lock.py
  34. + 2 - 5  FworkSpider/feapder/utils/tools.py
  35. + 3 - 8  FworkSpider/feapder/utils/webdriver.py
  36. + 4 - 4  FworkSpider/mongo_pipeline.py
  37. + 98 - 0  FworkSpider/mongo_pipeline_old.py
  38. + 39 - 10  FworkSpider/setting.py
  39. + 141 - 95  FworkSpider/untils/attachment.py
  40. + 136 - 0  FworkSpider/untils/cleaner.py
  41. + 3 - 2  FworkSpider/untils/cookie_pool.py
  42. + 1 - 1  FworkSpider/untils/create_menus.py
  43. + 7 - 0  FworkSpider/untils/get_imgcode.py
  44. + 55 - 89  FworkSpider/untils/tools.py
  45. + 0 - 23  NoteWork/cesspider/__init__.py
  46. + 0 - 247  NoteWork/cesspider/cesspider
  47. + 0 - 6  NoteWork/cesspider/hubeijianzhu.py
  48. + 0 - 50  NoteWork/cesspider/jiangxistouces.py
  49. + 0 - 80  NoteWork/cesspider/js/rsa/Barrett.js
  50. + 0 - 614  NoteWork/cesspider/js/rsa/BigInt.js
  51. + 0 - 583  NoteWork/cesspider/js/rsa/RSA.js
  52. + 0 - 0  NoteWork/cesspider/js/rsa/__init__.py
  53. + 0 - 109  NoteWork/cesspider/magpces.py
  54. + 0 - 95  NoteWork/cesspider/中国南方电网电子采购交易平台.py
  55. + 0 - 133  NoteWork/cesspider/中国鲁班商务委.py
  56. + 0 - 76  NoteWork/cesspider/交通银行供应商门户.py
  57. + 0 - 91  NoteWork/cesspider/华创迅采电子采购平台.py
  58. + 0 - 70  NoteWork/cesspider/国家税务总局宁波市税务局.py
  59. + 0 - 80  NoteWork/cesspider/城轨采购网.py
  60. + 0 - 74  NoteWork/cesspider/山西省招标投标协会.py
  61. + 0 - 32  NoteWork/cesspider/广东测试.py
  62. + 0 - 137  NoteWork/cesspider/广东省政府采购网.py
  63. + 0 - 9  NoteWork/cesspider/测试查询.py
  64. + 0 - 114  NoteWork/cesspider/滁州市人民政府网.py
  65. + 0 - 197  NoteWork/cesspider/甘肃政府采购网.py
  66. + 0 - 213  NoteWork/cesspider/甘肃政府采购网_ces.py
  67. + 0 - 194  NoteWork/cesspider/甘肃政府采购网_new.py
  68. + 0 - 106  NoteWork/cesspider/福建省政府采购网.py
  69. + 0 - 24  NoteWork/cesspider/黔云招采电子招标采购交易平台
  70. + 0 - 93  NoteWork/cesspider/黔云招采电子招标采购交易平台.py
  71. + 0 - 15  NoteWork/details/__init__.py
  72. + 0 - 194  NoteWork/details/detail_dtcookie.py
  73. + 0 - 134  NoteWork/details/detail_ztlbw.py
  74. + 0 - 1082  NoteWork/details/details
  75. + 0 - 170  NoteWork/details/details.py
  76. + 0 - 180  NoteWork/details/details_ces.py
  77. + 0 - 165  NoteWork/details/details_cookie.py
  78. + 0 - 115  NoteWork/details/details_firefox.py
  79. + 0 - 150  NoteWork/details/details_login.py
  80. + 0 - 88  NoteWork/details/dtcookie_pool.py
  81. + 0 - 1  NoteWork/details/file/sj.js
  82. + 0 - 34  NoteWork/details/迁移.py
  83. + 0 - 0  spiders/__init__.py
  84. + 0 - 0  spiders/李宗泽/__init__.py
  85. + 0 - 0  spiders/马国鹏/__init__.py
  86. + 0 - 88  spiders/马国鹏/中国南方航空采购招标网.py
  87. + 0 - 75  spiders/马国鹏/中国石化物质采购电子商务平台.py
  88. + 0 - 98  spiders/马国鹏/中泰集团招标投标网.py
  89. + 0 - 133  spiders/马国鹏/中铁鲁班商务网.py
  90. + 0 - 105  spiders/马国鹏/亿企优采.py
  91. + 0 - 76  spiders/马国鹏/华润置地华东大区网站.py
  92. + 0 - 120  spiders/马国鹏/南通市如皋市政府采购网上商城.py
  93. + 0 - 101  spiders/马国鹏/天津市政府采购网.py
  94. + 0 - 137  spiders/马国鹏/广东省政府采购网.py
  95. + 0 - 75  spiders/马国鹏/广发证券采购平台.py
  96. + 0 - 110  spiders/马国鹏/杭州市公共资源交易.py
  97. + 0 - 99  spiders/马国鹏/武汉市公共资源交易平台.py
  98. + 0 - 132  spiders/马国鹏/湖北省政府采购网.py
  99. + 0 - 113  spiders/马国鹏/滁州市人民政府网.py
  100. + 0 - 92  spiders/马国鹏/玖隆在线_交易公告.py

+ 0 - 47
Crawlb/docker-compose.yml

@@ -1,47 +0,0 @@
-version: '3.3'
-services:
-  master:
-    image: swordfish:v1
-    container_name: master_new
-    environment:
-       CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-       CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-       CRAWLAB_SERVER_MASTER: "Y"
-       CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-       CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-       CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-       CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-       CRAWLAB_REDIS_ADDRESS: "redis"  #
-#       CRAWLAB_REDIS_ADDRESS: "172.19.0.2"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-       CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-       CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-       CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-       CRAWLAB_SERVER_REGISTER_TYPE: "mac"
-    volumes: # 目录挂载,宿主机在前,容器在后
-      - /mnt/magp:/magp
-    ports:
-        - "8998:8080"
-
-
-#    depends_on:
-#          - redis
-
-#    deploy:
-#      resources:
-#        limits:
-#          memory: 15G
-#        reservations:
-#          memory: 1G
-
-#  mongo:
-#    image: mongo:latest
-#    restart: always
-#    ports:
-#      - "27027:27017"
-#  redis:
-#    image: redis:latest
-#    container_name: master_redis
-#    restart: always
-#    ports:
-#      - "6379:6379"
-#  wget http://download.firefox.com.cn/releases/firefox/78.14/zh-CN/Firefox-latest-x86_64.tar.bz2

+ 0 - 54
Crawlb/docker-compose_work.yml

@@ -1,54 +0,0 @@
-version: '3.3'
-services:
-  worker01:
-    image: swordfish:v1
-    container_name: crawlab_worker01
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
-
-  worker02:
-    image: swordfish:v1
-    container_name: crawlab_worker02
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
-  worker03:
-    image: swordfish:v1
-    container_name: crawlab_worker03
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"

+ 0 - 170
Details/detail_cookie.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details:cookie").start()
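
The deleted detail_cookie.py above pairs a download middleware that attaches a pooled cookie with parse steps that discard that cookie and re-queue the request whenever the configured failure-marker text appears in the body, or the status code falls in the codes listed under down_mid["code"]. A minimal standalone sketch of that flow (PageCookiePool.get_cookie()/del_cookie() are used exactly as in the code above; the helper names are illustrative, not part of the original):

from untils.cookie_pool import PageCookiePool

def attach_cookie(request, down_mid):
    # download-middleware step: pick a pooled cookie for this site
    pool = PageCookiePool(redis_key=down_mid["key"], page_url=down_mid["page_url"], selenium=False)
    request.cookies = pool.get_cookie()
    return request

def should_retry(response, down_mid):
    # re-queue when the failure marker text shows up, or when the status code
    # is one of the codes the task lists under "code"
    bad_text = down_mid.get("text")
    if bad_text and bad_text in response.text:
        return True
    return response.status_code in (down_mid.get("code") or [])

def drop_bad_cookie(request, down_mid):
    # remove the cookie that produced the failure so the pool regenerates one;
    # the spider then re-queues the request (in feapder: `yield request`)
    pool = PageCookiePool(redis_key=down_mid["key"], page_url=down_mid["page_url"], selenium=False)
    pool.del_cookie(request.cookies)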

+ 0 - 117
Details/detail_firefox.py

@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-        	code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()

+ 0 - 164
Details/details.py

@@ -1,164 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","failed":0},sort={"failed":1},limit=100)
-            for item in data_lsit:
-                print(item.get("item"))
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    elif files_info.get("file_type")=='file_name':
-                        file_type = file_name.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
-                    pass
-                else:
-                    list_item.projectinfo={"attachments":attachments}
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-        	code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
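
The attachment handling in the deleted Details/details.py (and in details_webcookie.py below) is a single loop: select the file nodes with list_xpath, resolve each URL and name, infer file_type from the URL, from the file name, or from a fixed value, keep only entries whose type is in files_type and whose URL contains url_key, and hand those to AttachmentDownloader. A condensed sketch of that loop, assuming the same files_info keys used above:

from urllib.parse import urljoin
from untils.attachment import AttachmentDownloader

def collect_attachments(response, files_info):
    # returns {"1": attachment, "2": attachment, ...}, the shape stored in projectinfo["attachments"]
    attachments = {}
    for node in response.xpath(files_info["list_xpath"]):
        file_url = node.xpath(files_info["url_xpath"]).extract_first()
        file_name = node.xpath(files_info["name_xpath"]).extract_first()
        if files_info.get("host"):
            file_url = urljoin(files_info["host"], file_url)
        if not files_info.get("file_type"):
            file_type = file_url.split("?")[0].split(".")[-1].lower()   # infer from the URL
        elif files_info["file_type"] == "file_name":
            file_type = file_name.split("?")[0].split(".")[-1].lower()  # infer from the file name
        else:
            file_type = files_info["file_type"]                         # fixed type from the task
        if file_type in files_info["files_type"] and files_info["url_key"] in file_url:
            attachments[str(len(attachments) + 1)] = AttachmentDownloader().fetch_attachment(
                file_name=file_name, file_type=file_type,
                download_url=file_url, enable_proxy=False)
    return attachments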

+ 0 - 200
Details/details_webcookie.py

@@ -1,200 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-from urllib.parse import urljoin
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-from untils.WebCookiePool import WebCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_webcookie"},sort={"date":-1},limit=100)
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        elif response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
-                    pass
-                else:
-                    list_item.projectinfo={"attachments":attachments}
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            cookie_key = down_mid.get("cookie_key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        elif response.status_code in request.down_mid.get("code"):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        else:
-            items = request.item
-            list_item = DataBakItem()
-            for key in items:
-                list_item.__setitem__(key,items[key])
-            exec(request.deal_detail)
-            yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_key = down_mid.get("cookie_key")
-        request.headers={"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
-        cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details_webcookie").start()
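
Every spider removed in this commit carries the same failed_request branching: it buckets the failed task by status code before writing it back to mgp_list with an incremented failed counter (the err_dic literal in the originals is built but never read). The same classification written as one helper, with the duplicate 3xx/4xx branches merged:

def classify_failure(response):
    # 0 means no response at all (timeout); otherwise bucket by status code
    code = 0 if response is None else response.status_code
    if 200 <= code < 300:
        err = "analysis"   # page came back but parsing failed
    elif 300 <= code < 500:
        err = "download"   # redirect or client error, counted as a download problem
    elif code >= 500:
        err = "servers"
    else:
        err = "timeout"
    return code, err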

+ 0 - 15
FworkSpider/details/__init__.py

@@ -1,15 +0,0 @@
-import requests
-
-
-headers = {
-
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-cookies = {
-    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
-}
-url = "http://www.snszgh.gov.cn/gsgg/index.html"
-response = requests.get(url, headers=headers,verify=False)
-
-print(response.text)
-print(response)

+ 0 - 134
FworkSpider/details/detail_ztlbw.py

@@ -1,134 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.log import Log
-from feapder.utils.tools import wechat_warning
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from login_pool.zglbw import ZglbwPool
-from untils.attachment import AttachmentDownloader
-
-Log().info("")
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
-                                        sort={"date": -1}, limit=1)
-            print(data_lsit)
-            for item in data_lsit:
-                url = item.get("parse_url")
-                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
-                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
-                cookie = cookie.get_cookie().cookie
-                yield feapder.Request(url=url, item=item.get("item"),
-                                      callback=self.detail_get, base_info=item, render=True,
-                                      render_time=3, proxies=False, cookies=cookie)
-                self.to_db.delete(self.db_name, item)
-            break
-
-    def detail_get(self, request, response):
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key, items[key])
-        html = ''
-        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
-                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
-                      '//div[@class="login ng-star-inserted"]']
-        for xpath in xpath_list:
-            # import pdb
-            # pdb.set_trace()
-            html_one = response.xpath(xpath).extract_first()
-            if html_one is not None:
-                html += '\n'  # 标书详细内容
-                html += html_one  # 拼接html
-        print(html)
-        list_item.contenthtml = html
-        files_list = response.xpath("//iframe/@src").extract_first()
-        file_url = files_list.split("file=")[-1]
-        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
-        attachments = {}
-        file_name = list_item.title
-
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type='pdf', download_url=file_url,
-            enable_proxy=False)
-        attachments["0"] = attachment
-        list_item.projectinfo = {"attachments": attachments}
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:ztlbw").start()
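
detail_ztlbw.py above recovers the PDF link from the viewer iframe by splitting the src on "file=" and undoing the percent-encoding with chained replace() calls (%3A, %2F, %3F, %3D). A sketch of that same step using the standard library instead of hand-written replacements; this is an equivalent, not the original code:

from urllib.parse import unquote

def pdf_url_from_iframe(iframe_src: str) -> str:
    # ".../viewer?file=https%3A%2F%2Fhost%2Fdoc.pdf" -> "https://host/doc.pdf"
    return unquote(iframe_src.split("file=")[-1])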

+ 0 - 170
FworkSpider/details/details.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
-            for item in data_lsit:
-                print(11111)
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files:
-            files_info = request.files
-            files =  response.xpath(files_info.get("list_xpath"))
-            if request.files_info:
-                files_info = request.files_info
-                files = response.xpath(files_info.get("list_xpath"))
-                if request.files_info:
-                    files_info = request.files_info
-                    files = response.xpath(files_info.get("list_xpath"))
-                    if len(files) > 0:
-                        attachments = {}
-                        for index, info in enumerate(files):
-                            file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                            file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                            if files_info.get("host"):
-                                file_url = urljoin(files_info.get("host"), file_url)
-                            if not files_info.get("file_type"):
-                                file_type = file_url.split("?")[0].split(".")[-1].lower()
-                            else:
-                                file_type = files_info.get("file_type")
-                            if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                                attachment = AttachmentDownloader().fetch_attachment(
-                                    file_name=file_name, file_type=file_type, download_url=file_url,
-                                    enable_proxy=False)
-                                attachments[len(attachments) + 1] = attachment
-                        if len(attachments) == 0:
-                            pass
-                        else:
-                            list_item.projectinfo = {"attachment": attachments}
-
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
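
Apart from their parse callbacks, these detail spiders differ mainly in the mgp_list query issued by start_requests: each pulls pending task documents for its parser_name, rebuilds a feapder.Request from the stored fields (parse_url, item, deal_detail, request_params, with the stored callback name resolved via eval), and deletes the task document once it is queued. A stripped-down skeleton of that dispatch loop, with the query filter as the only per-spider knob (class and attribute names here are illustrative):

import feapder
from feapder.db.mongodb import MongoDB

class DetailTaskSpider(feapder.Spider):
    db_name = "mgp_list"
    parser_name = "details"          # per-spider filter value
    _to_db = None

    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        tasks = self.to_db.find(self.db_name, {"parser_name": self.parser_name},
                                sort={"date": -1}, limit=100)
        for task in tasks:
            request_params = task.get("request_params") or {}
            yield feapder.Request(url=task["parse_url"], item=task["item"],
                                  deal_detail=task["deal_detail"],
                                  callback=eval(task["parse"]),   # e.g. "self.detail_get" -> bound method
                                  base_info=task, **request_params)
            self.to_db.delete(self.db_name, task)                 # task leaves the queue once dispatched

    def detail_get(self, request, response):
        # parse callback referenced by the stored task; the originals build a DataBakItem here
        pass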

+ 0 - 165
FworkSpider/details/details_cookie.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 115
FworkSpider/details/details_firefox.py

@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()

+ 0 - 150
FworkSpider/details/details_login.py

@@ -1,150 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import LoginCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-                print(down_mid)
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        mgp = MgpListItem()
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        print(f'......{mgp.failed}')
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
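
The spider above wires a shared cookie pool into download_midware and drops the current cookie whenever the response looks like a rejected session. A condensed sketch of that pattern, reusing only the PageCookiePool interface (get_cookie / del_cookie) already referenced in this file; the down_mid fields mirror the ones read above:

    from untils.cookie_pool import PageCookiePool

    def download_midware(request):
        """Attach a pooled cookie before the request is sent."""
        pool = PageCookiePool(redis_key=request.down_mid["key"],
                              page_url=request.down_mid["page_url"],
                              selenium=False)
        request.cookies = pool.get_cookie()
        return request

    def cookie_is_dead(request, response):
        """True when the blocked-text marker appears or the status code is in down_mid['code'];
        the caller should then del_cookie(request.cookies) and retry the request."""
        marker = request.down_mid.get("text")
        return (bool(marker) and marker in response.text) or \
               response.code in request.down_mid.get("code", ())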

+ 0 - 88
FworkSpider/details/dtcookie_pool.py

@@ -1,88 +0,0 @@
-import json
-import re
-import sys
-
-import execjs
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import PageCookiePool
-import requests
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self,redis_key,header,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
-        super(DTCookiePool, self).__init__(redis_key,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
-        self.headers=header
-        self.page_url = page_url
-
-    def create_cookie(self,):
-        session = requests.Session()
-        start_url = self.page_url
-        print(self.headers)
-        res = session.get(start_url, headers=self.headers,verify=False)
-        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
-        js_func = 'function sd() { return ' + js_func + "}"
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        ss = ctx.call("sd")
-        cookies = {}
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-        res = session.get(start_url, cookies=cookies, headers=self.headers)
-        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
-        js_func = re.sub("<(/*?)script>", "", res.text)
-        location = re.compile('location(.*?)}else')
-        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
-        setTimeout2 = re.compile('setTimeout(.*?)document')
-        gox = re.compile('};go(.*?)\)')
-        js_func = re.sub(location, "}else", js_func)
-        js_func = re.sub(setTimeout, "       document", js_func)
-        js_func = re.sub(setTimeout2, "       document", js_func)
-        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
-        js_func = '''const jsdom = require("jsdom");
-        const {JSDOM} = jsdom;
-        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
-        window = dom.window;
-        document = window.document;''' + js_func
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        with open('ex_js.js', 'w+', encoding='utf-8') as f:
-            f.write(js_func)
-        print(js_do_data)
-        ss = ctx.call("go", json.loads(js_do_data))
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-        res = session.get(start_url, headers=self.headers, cookies=cookies)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        return cookies
-
-if __name__ == '__main__':
-    headers = {
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Accept-Encoding": "gzip, deflate, br",
-    "Accept-Language": "zh-CN,zh;q=0.9",
-    "Cache-Control": "max-age=0",
-    "Connection": "keep-alive",
-    "Host": "www.hefei.gov.cn",
-    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"Windows\"",
-    "Sec-Fetch-Dest": "document",
-    "Sec-Fetch-Mode": "navigate",
-    "Sec-Fetch-Site": "none",
-    "Sec-Fetch-User": "?1",
-    "Upgrade-Insecure-Requests": "1",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-}
-
-    cookie_pool = DTCookiePool(
-        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
-        header=headers, redis_key="dongtaices")
-    cookie = cookie_pool.get_cookie()
-    print(cookie)
-    # cookie_pool.del_cookie(cookie)
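
The generator above compiles the site's anti-bot JavaScript with execjs and replays the returned document.cookie string. The core call pattern, reduced to a runnable sketch (the JS body is a placeholder, and a real script may need cwd= pointing at a node_modules directory, as the deleted code did):

    import execjs

    js_func = "function sd() { return 'acw_tc=abc123; path=/'; }"  # stand-in for the extracted script
    ctx = execjs.compile(js_func)
    cookie_str = ctx.call("sd")

    cookies = {}
    for part in cookie_str.split(";"):
        if "=" in part:
            name, _, value = part.strip().partition("=")
            cookies[name] = value
    print(cookies)  # {'acw_tc': 'abc123', 'path': '/'}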

文件差异内容过多而无法显示
+ 0 - 1
FworkSpider/details/file/sj.js


+ 29 - 22
FworkSpider/feapder/buffer/item_buffer.py

@@ -99,9 +99,9 @@ class ItemBuffer(threading.Thread):
 
 
         return self._mysql_pipeline
         return self._mysql_pipeline
 
 
-    def run(self):
+    def run(self): # step 1 开始
         self._thread_stop = False
         self._thread_stop = False
-        while not self._thread_stop:
+        while not self._thread_stop: # 爬虫不停止,就一直循环刷新
             self.flush()
             self.flush()
             tools.delay_time(1)
             tools.delay_time(1)
 
 
@@ -111,13 +111,18 @@ class ItemBuffer(threading.Thread):
         self._thread_stop = True
         self._thread_stop = True
         self._started.clear()
         self._started.clear()
 
 
-    def put_item(self, item):
+    def put_item(self, item): # step 存储数据的入口 将需要存储的数据放入数据管道队列
         if isinstance(item, Item):
         if isinstance(item, Item):
             # 入库前的回调
             # 入库前的回调
-            item.pre_to_db()
-
-        self._items_queue.put(item)
 
 
+            if item.item_name == "ListItem":  # 测试框架有用,对listitem不进行存储,正式框架没有这个判断
+                return
+            item.pre_to_db()
+            # print(item)
+            if item.save: # 根据save字段,判断该条信息是否存储
+                self._items_queue.put(item)
+        else:
+            self._items_queue.put(item)
     def flush(self):
     def flush(self):
         try:
         try:
             items = []
             items = []
@@ -127,26 +132,26 @@ class ItemBuffer(threading.Thread):
             items_fingerprints = []
             items_fingerprints = []
             data_count = 0
             data_count = 0
 
 
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
+            while not self._items_queue.empty(): # step 2 数据管道队列不为空时 不等待直接取值
+                data = self._items_queue.get_nowait() # 队列的 不等待直接取值方法,类似get
                 data_count += 1
                 data_count += 1
 
 
                 # data 分类
                 # data 分类
                 if callable(data):
                 if callable(data):
                     callbacks.append(data)
                     callbacks.append(data)
 
 
-                elif isinstance(data, UpdateItem):
+                elif isinstance(data, UpdateItem): # 更新型数据,走更新管道,采集框架只存不更新,可以忽略不看
                     update_items.append(data)
                     update_items.append(data)
 
 
                 elif isinstance(data, Item):
                 elif isinstance(data, Item):
                     items.append(data)
                     items.append(data)
-                    if setting.ITEM_FILTER_ENABLE:
+                    if setting.ITEM_FILTER_ENABLE: # item去重,对于当前框架,无效,不看
                         items_fingerprints.append(data.fingerprint)
                         items_fingerprints.append(data.fingerprint)
 
 
                 else:  # request-redis
                 else:  # request-redis
                     requests.append(data)
                     requests.append(data)
 
 
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
+                if data_count >= UPLOAD_BATCH_MAX_SIZE: # step 3 需要存储的数据,达到一定数量后,统一存储
                     self.__add_item_to_db(
                     self.__add_item_to_db(
                         items, update_items, requests, callbacks, items_fingerprints
                         items, update_items, requests, callbacks, items_fingerprints
                     )
                     )
@@ -158,7 +163,7 @@ class ItemBuffer(threading.Thread):
                     items_fingerprints = []
                     items_fingerprints = []
                     data_count = 0
                     data_count = 0
 
 
-            if data_count:
+            if data_count: # step 3 管道为空后,将剩余的数据,统一存储
                 self.__add_item_to_db(
                 self.__add_item_to_db(
                     items, update_items, requests, callbacks, items_fingerprints
                     items, update_items, requests, callbacks, items_fingerprints
                 )
                 )
@@ -243,11 +248,11 @@ class ItemBuffer(threading.Thread):
         return datas_dict
         return datas_dict
 
 
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        # 打点 校验
+        # step 3.1.1 打点 记录总条数及每个key情况
         self.check_datas(table=table, datas=datas)
         self.check_datas(table=table, datas=datas)
 
 
-        for pipeline in self._pipelines:
-            if is_update:
+        for pipeline in self._pipelines: # setting 配置的 pipelines 方法
+            if is_update: # 更新方法 不看
                 if table == self._task_table and not isinstance(
                 if table == self._task_table and not isinstance(
                     pipeline, MysqlPipeline
                     pipeline, MysqlPipeline
                 ):
                 ):
@@ -260,7 +265,7 @@ class ItemBuffer(threading.Thread):
                     return False
                     return False
 
 
             else:
             else:
-                if not pipeline.save_items(table, datas):
+                if not pipeline.save_items(table, datas): # step 3.1.2 调用 pipeline 的 save_items 方法
                     log.error(
                     log.error(
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                     )
                     )
@@ -284,11 +289,11 @@ class ItemBuffer(threading.Thread):
         export_success = True
         export_success = True
         self._is_adding_to_db = True
         self._is_adding_to_db = True
 
 
-        # 去重
+        # 去重 item去重,不看
         if setting.ITEM_FILTER_ENABLE:
         if setting.ITEM_FILTER_ENABLE:
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
 
 
-        # 分捡
+        # step 分拣 按表名将数据分组,拆分后原 items 为空
         items_dict = self.__pick_items(items)
         items_dict = self.__pick_items(items)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
 
 
@@ -306,7 +311,7 @@ class ItemBuffer(threading.Thread):
                 % (table, tools.dumps_json(datas, indent=16))
                 % (table, tools.dumps_json(datas, indent=16))
             )
             )
 
 
-            if not self.__export_to_db(table, datas):
+            if not self.__export_to_db(table, datas): # step 3.1 导出到数据库
                 export_success = False
                 export_success = False
                 failed_items["add"].append({"table": table, "datas": datas})
                 failed_items["add"].append({"table": table, "datas": datas})
 
 
@@ -331,7 +336,7 @@ class ItemBuffer(threading.Thread):
                 failed_items["update"].append({"table": table, "datas": datas})
                 failed_items["update"].append({"table": table, "datas": datas})
 
 
         if export_success:
         if export_success:
-            # 执行回调
+            # step 3.2 保存成功后,执行回调
             while callbacks:
             while callbacks:
                 try:
                 try:
                     callback = callbacks.pop(0)
                     callback = callbacks.pop(0)
@@ -339,15 +344,17 @@ class ItemBuffer(threading.Thread):
                 except Exception as e:
                 except Exception as e:
                     log.exception(e)
                     log.exception(e)
 
 
-            # 删除做过的request
+            # step 删除做过的request
             if requests:
             if requests:
                 self.redis_db.zrem(self._table_request, requests)
                 self.redis_db.zrem(self._table_request, requests)
 
 
-            # 去重入库
+            # 去重入库 不走这个去重
             if setting.ITEM_FILTER_ENABLE:
             if setting.ITEM_FILTER_ENABLE:
                 if items_fingerprints:
                 if items_fingerprints:
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
         else:
         else:
+            # step 3.3 保存失败后的处理,记录失败数据
+
             failed_items["requests"] = requests
             failed_items["requests"] = requests
 
 
             if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
             if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
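
The put_item/flush changes above make ItemBuffer a buffered writer: items accumulate in a queue and are written in batches of UPLOAD_BATCH_MAX_SIZE, with one final partial batch when the queue drains. A stripped-down sketch of that loop using only the standard library (the batch size and save_batch callable are placeholders):

    from queue import Queue, Empty

    UPLOAD_BATCH_MAX_SIZE = 1000

    def flush(items_queue: Queue, save_batch):
        """Drain the queue and hand the data to save_batch in fixed-size chunks."""
        batch = []
        while not items_queue.empty():
            try:
                batch.append(items_queue.get_nowait())
            except Empty:
                break
            if len(batch) >= UPLOAD_BATCH_MAX_SIZE:   # full batch: write and start over
                save_batch(batch)
                batch = []
        if batch:                                     # leftover partial batch
            save_batch(batch)

In the real buffer the drained data is further split by type (Item, UpdateItem, request, callback) and by target table before being handed to the configured pipelines.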

+ 3 - 3
FworkSpider/feapder/buffer/request_buffer.py

@@ -44,9 +44,9 @@ class RequestBuffer(threading.Thread):
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                 )  # 默认过期时间为一个月
                 )  # 默认过期时间为一个月
 
 
-    def run(self):
+    def run(self): # step 1 线程入口
         self._thread_stop = False
         self._thread_stop = False
-        while not self._thread_stop:
+        while not self._thread_stop: # 循环将缓冲的任务批量存储入库
             try:
             try:
                 self.__add_request_to_db()
                 self.__add_request_to_db()
             except Exception as e:
             except Exception as e:
@@ -94,7 +94,7 @@ class RequestBuffer(threading.Thread):
         callbacks = []
         callbacks = []
 
 
         while self._requests_deque:
         while self._requests_deque:
-            request = self._requests_deque.popleft()
+            request = self._requests_deque.popleft() # 从任务队列中从左取任务(先进先出)
             self._is_adding_to_db = True
             self._is_adding_to_db = True
 
 
             if callable(request):
             if callable(request):
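
RequestBuffer drains its internal deque from the left, so requests reach Redis in the order they were produced. The FIFO behaviour in isolation:

    from collections import deque

    pending = deque(["req-1", "req-2", "req-3"])
    flushed = []
    while pending:
        flushed.append(pending.popleft())   # popleft -> first in, first out

    print(flushed)  # ['req-1', 'req-2', 'req-3']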

+ 1 - 1
FworkSpider/feapder/commands/create_builder.py

@@ -20,7 +20,7 @@ def main():
         "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
         "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
     )
     )
     spider.add_argument(
     spider.add_argument(
-        "-s",
+        "--s",
         "--spider",
         "--spider",
         nargs="+",
         nargs="+",
         help="创建爬虫\n"
         help="创建爬虫\n"

+ 33 - 0
FworkSpider/feapder/core/base_parser.py

@@ -9,6 +9,8 @@ Created on 2018-07-25 11:41:57
 """
 """
 import os
 import os
 import traceback
 import traceback
+
+import feapder
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.db.mysqldb import MysqlDB
 from feapder.db.mysqldb import MysqlDB
 from feapder.network.item import UpdateItem
 from feapder.network.item import UpdateItem
@@ -89,12 +91,43 @@ class BaseParser(object):
         """
         """
 
 
         pass
         pass
+    def infinite_crawl(self, request, response):
+        menu = request.item
+        list_item = request.list_item
+        if self.platform_next_page:  # real_page 统计连续无新增数据的翻页次数,采到数据时清零
+            if getattr(request, 'real_page', None) is None:
+                request.real_page = 0
+
+            request.real_page += 1
+            if list_item.rel_count > 0:
+                request.real_page = 0
+
+            if request.real_page <= 5 and request.page < self.platform_max_page:
+                request.page += 1
+                request.callback = self.parse
+                if getattr(request, 'new_callback', None) is not None:
+                    request.callback = eval(request.new_callback)
+                yield request
+        else:
+            if request.page < menu.get("crawl_page"):
+                request.page += 1
+                request.callback = self.parse
+                if getattr(request, 'new_callback', None) is not None:
+                    request.callback = eval(request.new_callback)
+                yield request
+
     def push_files(self, request, response):
     def push_files(self, request, response):
         """
         """
         @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
         @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
         ---------
         ---------
         @param request:  request.url 为文件下载地址, 该方法需要自行调用
         @param request:  request.url 为文件下载地址, 该方法需要自行调用
         request.INFO  为上传文件时所需要提供的部分参数  必传
         request.INFO  为上传文件时所需要提供的部分参数  必传
+         info = {
+            "org_url": "http://www...",  # 文件下载链接
+            "filename": f"{list_item.title}.docx",  # 文件名
+            "channel": list_item.channel,
+            "ftype": 'docx,zip,ftp', # 文件类型
+        }
         request.headers 则存放请求的必要参数,如:parmas,headers  必传
         request.headers 则存放请求的必要参数,如:parmas,headers  必传
         ---------
         ---------
         @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
         @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
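
The INFO block documented above is the payload push_files expects alongside the download request. A hedged sketch of how a detail parser might assemble such a request (auto_request=False per the docstring; the helper name and field values are illustrative):

    import feapder

    def build_attachment_request(list_item, file_url, headers):
        info = {
            "org_url": file_url,                    # 文件下载链接
            "filename": f"{list_item.title}.docx",  # 文件名
            "channel": list_item.channel,
            "ftype": "docx",                        # 文件类型
        }
        return feapder.Request(
            url=file_url,
            auto_request=False,      # 由 push_files 自行下载,避免响应失败时丢文件
            callback="push_files",   # 由 BaseParser.push_files 回调处理
            INFO=info,
            headers=headers,
        )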

+ 6 - 5
FworkSpider/feapder/core/collector.py

@@ -48,11 +48,11 @@ class Collector(threading.Thread):
 
 
         self.__delete_dead_node()
         self.__delete_dead_node()
 
 
-    def run(self):
+    def run(self):  # step 线程入口
         self._thread_stop = False
         self._thread_stop = False
         while not self._thread_stop:
         while not self._thread_stop:
             try:
             try:
-                self.__report_node_heartbeat()
+                self.__report_node_heartbeat() # step 汇报节点心跳
                 self.__input_data()
                 self.__input_data()
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -67,23 +67,24 @@ class Collector(threading.Thread):
 
 
     def __input_data(self):
     def __input_data(self):
         current_timestamp = tools.get_current_timestamp()
         current_timestamp = tools.get_current_timestamp()
-        if len(self._todo_requests) >= self._request_count:
+        if len(self._todo_requests) >= self._request_count: # step 待执行任务数量>设置的任务数量上限 不处理
             return
             return
 
 
         request_count = self._request_count  # 先赋值
         request_count = self._request_count  # 先赋值
-        # 查询最近有心跳的节点数量
+        # step 查询最近有心跳的节点数量
         spider_count = self._db.zget_count(
         spider_count = self._db.zget_count(
             self._tab_spider_status,
             self._tab_spider_status,
             priority_min=current_timestamp - (self._interval + 10),
             priority_min=current_timestamp - (self._interval + 10),
             priority_max=current_timestamp,
             priority_max=current_timestamp,
         )
         )
-        # 根据等待节点数量,动态分配request
+        # step 根据等待节点数量,动态分配request
         if spider_count:
         if spider_count:
             # 任务数量
             # 任务数量
             task_count = self._db.zget_count(self._tab_requests)
             task_count = self._db.zget_count(self._tab_requests)
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             request_count = task_count // spider_count + 1
             request_count = task_count // spider_count + 1
 
 
+        # step 判断 request_count 是否超过设置的上限,超过则重置为上限
         request_count = (
         request_count = (
             request_count
             request_count
             if request_count <= self._request_count
             if request_count <= self._request_count
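
The allocation rule annotated above gives each live node task_count // spider_count + 1 requests, then caps the figure at the per-node limit. A worked example:

    def allocate(task_count, spider_count, per_node_limit):
        """Dynamic share for one node, mirroring Collector.__input_data."""
        share = task_count // spider_count + 1 if spider_count else per_node_limit
        return min(share, per_node_limit)

    print(allocate(task_count=95, spider_count=4, per_node_limit=32))    # 24  (95 // 4 + 1)
    print(allocate(task_count=1000, spider_count=2, per_node_limit=32))  # 32  (capped at the limit)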

+ 40 - 25
FworkSpider/feapder/core/parser_control.py

@@ -46,11 +46,11 @@ class PaserControl(threading.Thread):
 
 
         self._wait_task_time = 0
         self._wait_task_time = 0
 
 
-    def run(self):
+    def run(self):  # step 1 开始
         self._thread_stop = False
         self._thread_stop = False
         while not self._thread_stop:
         while not self._thread_stop:
             try:
             try:
-                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
+                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT) # step 2 获取任务
                 if not requests:
                 if not requests:
                     if not self.is_show_tip:
                     if not self.is_show_tip:
                         log.debug("parser 等待任务...")
                         log.debug("parser 等待任务...")
@@ -63,7 +63,7 @@ class PaserControl(threading.Thread):
                     continue
                     continue
 
 
                 self.is_show_tip = False
                 self.is_show_tip = False
-                self.deal_requests(requests)
+                self.deal_requests(requests) # step 3 开始处理任务
 
 
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -90,17 +90,17 @@ class PaserControl(threading.Thread):
                 if parser.name == request.parser_name:
                 if parser.name == request.parser_name:
                     used_download_midware_enable = False
                     used_download_midware_enable = False
                     try:
                     try:
-                        # 记录需下载的文档
+                        # step 4 记录需下载的文档
                         self.record_download_status(
                         self.record_download_status(
                             PaserControl.DOWNLOAD_TOTAL, parser.name
                             PaserControl.DOWNLOAD_TOTAL, parser.name
                         )
                         )
 
 
-                        # 解析request
+                        # step 5 解析request
                         if request.auto_request:
                         if request.auto_request:
                             request_temp = None
                             request_temp = None
                             response = None
                             response = None
 
 
-                            # 下载中间件
+                            # step 6 运行下载中间件 分两种,一种爬虫自定义的中间件,一种通过request传过来的中间件方法
                             if request.download_midware:
                             if request.download_midware:
                                 if isinstance(request.download_midware, (list, tuple)):
                                 if isinstance(request.download_midware, (list, tuple)):
                                     request_temp = request
                                     request_temp = request
@@ -122,10 +122,10 @@ class PaserControl(threading.Thread):
                                         )
                                         )
                                     )
                                     )
                                     request_temp = download_midware(request)
                                     request_temp = download_midware(request)
-                            elif request.download_midware != False:
+                            elif request.download_midware != False: # NOTE 应用场景尚不明确
                                 request_temp = parser.download_midware(request)
                                 request_temp = parser.download_midware(request)
 
 
-                            # 请求
+                            # step 7 开始处理请求
                             if request_temp:
                             if request_temp:
                                 if (
                                 if (
                                     isinstance(request_temp, (tuple, list))
                                     isinstance(request_temp, (tuple, list))
@@ -150,10 +150,14 @@ class PaserControl(threading.Thread):
                                             )
                                             )
                                         )
                                         )
                                     except Exception as e:
                                     except Exception as e:
-                                        log.info("requests", extra={"url": request.url or request_temp.url, "code": -1,"error_info":e})
+                                        response = None
+                                        log.info("requests", extra={"url": request.url or request_temp.url,"code": -1,"error_info":e})
                                         raise Exception(
                                         raise Exception(
-                                            "连接超时 url: %s" % (request.url or request_temp.url)
+                                            "request 请求异常: %s url: %s" % (e,request.url or request_temp.url)
                                         )
                                         )
+                                    except:
+                                        response = None
+                                        log.error("request 请求异常 url: %s" % (request.url or request_temp.url))
 
 
                             else:
                             else:
                                 try:
                                 try:
@@ -165,23 +169,31 @@ class PaserControl(threading.Thread):
                                         )
                                         )
                                     )
                                     )
                                 except Exception as e:
                                 except Exception as e:
+                                    response = None
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     raise Exception(
                                     raise Exception(
-                                        "连接超时 url: %s" % (request.url or request_temp.url)
+                                        "request 请求异常:%s url: %s" % (e,request.url or request_temp.url)
                                     )
                                     )
+                                except:
+                                    response = None
+                                    log.error("request 请求异常 url: %s" % (request.url or request_temp.url))
+                                    # raise Exception(
+                                    #     "response 请求异常 url: %s" % (request.url or request_temp.url))
+
 
 
                             if response == None:
                             if response == None:
                                 raise Exception(
                                 raise Exception(
-                                    "连接超时 url: %s" % (request.url or request_temp.url)
+                                    "request 请求异常,无法定位错误信息 url: %s" % (request.url or request_temp.url)
                                 )
                                 )
 
 
                         else:
                         else:
                             response = None
                             response = None
 
 
-                        # 校验
+                        # step 8 校验response 可以脚本自定义
+                        #  TODO 针对登录网站、动态cookie,可以考虑使用上 validate 方法,在 validate 方法中判断cookie是否正常
                         if parser.validate(request, response) == False:
                         if parser.validate(request, response) == False:
                             continue
                             continue
-
+                        #  step 9 走回调方法 如果有parser的回调函数,则用回调处理,否则默认用parser处理
                         if request.callback:  # 如果有parser的回调函数,则用回调处理
                         if request.callback:  # 如果有parser的回调函数,则用回调处理
                             callback_parser = (
                             callback_parser = (
                                 request.callback
                                 request.callback
@@ -198,9 +210,9 @@ class PaserControl(threading.Thread):
                                 % (parser.name, request.callback or "parse")
                                 % (parser.name, request.callback or "parse")
                             )
                             )
 
 
-                        # 标识上一个result是什么
+                        # step 标识上一个result是什么
                         result_type = 0  # 0\1\2 (初始值\request\item)
                         result_type = 0  # 0\1\2 (初始值\request\item)
-                        # 此处判断是request 还是 item
+                        #  step 10 判断 result 是request 还是 item
                         for result in results or []:
                         for result in results or []:
                             if isinstance(result, Request):
                             if isinstance(result, Request):
                                 result_type = 1
                                 result_type = 1
@@ -208,28 +220,28 @@ class PaserControl(threading.Thread):
                                 result.parser_name = result.parser_name or parser.name
                                 result.parser_name = result.parser_name or parser.name
 
 
                                 # 判断是同步的callback还是异步的
                                 # 判断是同步的callback还是异步的
-                                if result.request_sync:  # 同步
+                                if result.request_sync:  # 同步请求,加入当前批次直接处理
                                     request_dict = {
                                     request_dict = {
                                         "request_obj": result,
                                         "request_obj": result,
                                         "request_redis": None,
                                         "request_redis": None,
                                     }
                                     }
                                     requests.append(request_dict)
                                     requests.append(request_dict)
                                 else:  # 异步
                                 else:  # 异步
-                                    # 将next_request 入库
+                                    # step 10.1 将next_request 入库 进行下一个循环
                                     self._request_buffer.put_request(result)
                                     self._request_buffer.put_request(result)
                                     del_request_redis_after_request_to_db = True
                                     del_request_redis_after_request_to_db = True
 
 
                             elif isinstance(result, Item):
                             elif isinstance(result, Item):
                                 result_type = 2
                                 result_type = 2
-                                # 将item入库
+                                # step 10.1 将item入库
                                 self._item_buffer.put_item(result)
                                 self._item_buffer.put_item(result)
-                                # 需删除正在做的request
+                                # step 10.2 需删除正在做的request
                                 del_request_redis_after_item_to_db = True
                                 del_request_redis_after_item_to_db = True
 
 
-                            elif callable(result):  # result为可执行的无参函数
+                            elif callable(result):  # NOTE result 为可执行的无参函数
                                 if (
                                 if (
                                     result_type == 2
                                     result_type == 2
-                                ):  # item 的 callback,buffer里的item均入库后再执行
+                                ):  # step 10.1 item 的 callback,buffer里的item均入库后再执行
                                     self._item_buffer.put_item(result)
                                     self._item_buffer.put_item(result)
                                     del_request_redis_after_item_to_db = True
                                     del_request_redis_after_item_to_db = True
 
 
@@ -393,7 +405,7 @@ class PaserControl(threading.Thread):
                                 del_request_redis_after_request_to_db = True
                                 del_request_redis_after_request_to_db = True
 
 
                     else:
                     else:
-                        # 记录下载成功的文档
+                        # step 11 记录下载成功的文档
                         self.record_download_status(
                         self.record_download_status(
                             PaserControl.DOWNLOAD_SUCCESS, parser.name
                             PaserControl.DOWNLOAD_SUCCESS, parser.name
                         )
                         )
@@ -425,7 +437,7 @@ class PaserControl(threading.Thread):
                 else:
                 else:
                     self._request_buffer.put_del_request(request_redis)
                     self._request_buffer.put_del_request(request_redis)
 
 
-        if setting.SPIDER_SLEEP_TIME:
+        if setting.SPIDER_SLEEP_TIME: # 如果设置了取任务的休眠时间,这里会稍微等待一下
             if (
             if (
                 isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
                 isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
                 and len(setting.SPIDER_SLEEP_TIME) == 2
                 and len(setting.SPIDER_SLEEP_TIME) == 2
@@ -559,8 +571,11 @@ class AirSpiderParserControl(PaserControl):
                                 except Exception as e:
                                 except Exception as e:
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     raise Exception(
                                     raise Exception(
-                                        "连接超时 url: %s" % (request.url or request_temp.url)
+                                        "565 连接超时 url: %s" % (request.url or request_temp.url)
                                     )
                                     )
+                                except:
+                                    raise Exception(
+                                        "response 请求超时 url: %s" % (request.url or request_temp.url))
 
 
                         else:
                         else:
                             response = None
                             response = None
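
Step 6 above runs either the spider's own download_midware or one attached to the request; in both cases the middleware receives the request, mutates it, and returns it. A minimal sketch (the proxy address is a placeholder):

    def download_midware(request):
        """Runs before the request is sent; return the (possibly modified) request."""
        request.headers = {"User-Agent": "Mozilla/5.0"}
        request.proxies = {"http": "http://127.0.0.1:8888",    # placeholder proxy
                           "https": "http://127.0.0.1:8888"}
        return request

    # per-request:  feapder.Request(url, download_midware=download_midware)
    # or define a method with the same name on the spider to apply it to every request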

+ 62 - 45
FworkSpider/feapder/core/scheduler.py

@@ -7,10 +7,13 @@ Created on 2017-01-09 10:38
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
+import json
+import sys
 import threading
 import threading
 import time
 import time
 from collections import Iterable
 from collections import Iterable
 
 
+
 import feapder.setting as setting
 import feapder.setting as setting
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.buffer.item_buffer import ItemBuffer
 from feapder.buffer.item_buffer import ItemBuffer
@@ -30,7 +33,9 @@ SPIDER_START_TIME_KEY = "spider_start_time"
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 
 
-
+class Obj(object):
+    def __init__(self, dict_):
+        self.__dict__.update(dict_)
 class Scheduler(threading.Thread):
 class Scheduler(threading.Thread):
     __custom_setting__ = {}
     __custom_setting__ = {}
 
 
@@ -96,6 +101,7 @@ class Scheduler(threading.Thread):
         if "auto_stop_when_spider_done" in kwargs:
         if "auto_stop_when_spider_done" in kwargs:
             self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
             self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
         else:
         else:
+
             self._keep_alive = (
             self._keep_alive = (
                 keep_alive if keep_alive is not None else setting.KEEP_ALIVE
                 keep_alive if keep_alive is not None else setting.KEEP_ALIVE
             )
             )
@@ -164,18 +170,18 @@ class Scheduler(threading.Thread):
         else:
         else:
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
 
 
-    def run(self):
-        if not self.is_reach_next_spider_time():
+    def run(self):  # STEP 1 爬虫框架入口
+        if not self.is_reach_next_spider_time(): # STEP 2 检测爬虫是否到达执行时间
             return
             return
 
 
-        self._start()
+        self._start() # STEP 3 开始运行爬虫
 
 
-        while True:
+        while True: # step 4 对爬虫状态的一个监控
             try:
             try:
-                if self.all_thread_is_done():
+                if self.all_thread_is_done(): # Step 5 判断爬虫是否运行完成
                     if not self._is_notify_end:
                     if not self._is_notify_end:
                         self.spider_end()  # 跑完一轮
                         self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
+                        self.record_spider_state(  # step 6 记录爬虫结束状态
                             spider_type=1,
                             spider_type=1,
                             state=1,
                             state=1,
                             spider_end_time=tools.get_current_date(),
                             spider_end_time=tools.get_current_date(),
@@ -184,14 +190,14 @@ class Scheduler(threading.Thread):
 
 
                         self._is_notify_end = True
                         self._is_notify_end = True
 
 
-                    if not self._keep_alive:
+                    if not self._keep_alive: # step 7 如果不是常驻爬虫 停止所有线程
                         self._stop_all_thread()
                         self._stop_all_thread()
                         break
                         break
 
 
                 else:
                 else:
                     self._is_notify_end = False
                     self._is_notify_end = False
 
 
-                self.check_task_status()
+                self.check_task_status() # step 8 检查任务状态,并进行告警通知
 
 
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -221,13 +227,13 @@ class Scheduler(threading.Thread):
                     raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
                     raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
 
 
                 result_type = 1
                 result_type = 1
-                for result in results or []:
-                    if isinstance(result, Request):
+                for result in results or []: # step 对yield 的数据进行判断处理
+                    if isinstance(result, Request): # Request 加入到任务队列
                         result.parser_name = result.parser_name or parser.name
                         result.parser_name = result.parser_name or parser.name
                         self._request_buffer.put_request(result)
                         self._request_buffer.put_request(result)
                         result_type = 1
                         result_type = 1
 
 
-                    elif isinstance(result, Item):
+                    elif isinstance(result, Item): # Item 数据,存入到数据管道队列,等待存储
                         self._item_buffer.put_item(result)
                         self._item_buffer.put_item(result)
                         result_type = 2
                         result_type = 2
 
 
@@ -247,15 +253,16 @@ class Scheduler(threading.Thread):
                 self._item_buffer.flush()
                 self._item_buffer.flush()
 
 
     def _start(self):
     def _start(self):
-        # 启动request_buffer
-        self._request_buffer.start()
-        # 启动item_buffer
-        self._item_buffer.start()
-        # 启动collector
-        self._collector.start()
+
+        self._request_buffer.start()  # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
+
+        self._item_buffer.start()  # STEP 3.2 启动item_buffer -- 管道管理器 负责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
+
+        self._collector.start()  # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
 
 
         # 启动parser control
         # 启动parser control
         for i in range(self._thread_count):
         for i in range(self._thread_count):
+            # STEP 3.4 根据 任务管理器、redis_key,下载器,数据管道创建一个线程池
             parser_control = self._parser_control_obj(
             parser_control = self._parser_control_obj(
                 self._collector,
                 self._collector,
                 self._redis_key,
                 self._redis_key,
@@ -263,22 +270,22 @@ class Scheduler(threading.Thread):
                 self._item_buffer,
                 self._item_buffer,
             )
             )
 
 
-            for parser in self._parsers:
+            for parser in self._parsers:  # step 3.5 把所有任务放入线程池
                 parser_control.add_parser(parser)
                 parser_control.add_parser(parser)
 
 
-            parser_control.start()
+            parser_control.start()  # STEP 3.6 根据线程池开辟一个线程
             self._parser_controls.append(parser_control)
             self._parser_controls.append(parser_control)
 
 
-        # 下发任务 因为时间可能比较长,放到最后面
+        # STEP 3.7下发任务 有消费线程之后开始读取任务
         if setting.RETRY_FAILED_REQUESTS:
         if setting.RETRY_FAILED_REQUESTS:
             # 重设失败的任务, 不用加锁,原子性操作
             # 重设失败的任务, 不用加锁,原子性操作
             handle_failed_requests = HandleFailedRequests(self._redis_key)
             handle_failed_requests = HandleFailedRequests(self._redis_key)
             handle_failed_requests.reput_failed_requests_to_requests()
             handle_failed_requests.reput_failed_requests_to_requests()
 
 
-        # 下发新任务
+        # STEP 3.8下发新任务 ,生产新任务
         if self._auto_start_requests:  # 自动下发
         if self._auto_start_requests:  # 自动下发
             if self.wait_lock:
             if self.wait_lock:
-                # 将添加任务处加锁,防止多进程之间添加重复的任务
+                # Stress 将添加任务处加锁,防止多进程之间添加重复的任务
                 with RedisLock(key=self._spider_name) as lock:
                 with RedisLock(key=self._spider_name) as lock:
                     if lock.locked:
                     if lock.locked:
                         self.__add_task()
                         self.__add_task()
@@ -286,34 +293,34 @@ class Scheduler(threading.Thread):
                 self.__add_task()
                 self.__add_task()
 
 
     def all_thread_is_done(self):
     def all_thread_is_done(self):
-        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
-            # 检测 collector 状态
+        for i in range(3):  # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+            # STEP 5.1 检测 collector 状态
             if (
             if (
                 self._collector.is_collector_task()
                 self._collector.is_collector_task()
                 or self._collector.get_requests_count() > 0
                 or self._collector.get_requests_count() > 0
             ):
             ):
                 return False
                 return False
 
 
-            # 检测 parser_control 状态
+            # STEP 5.2 检测 parser_control 状态
             for parser_control in self._parser_controls:
             for parser_control in self._parser_controls:
                 if not parser_control.is_not_task():
                 if not parser_control.is_not_task():
                     return False
                     return False
 
 
-            # 检测 item_buffer 状态
+            # STEP 5.3 检测 item_buffer 状态
             if (
             if (
                 self._item_buffer.get_items_count() > 0
                 self._item_buffer.get_items_count() > 0
                 or self._item_buffer.is_adding_to_db()
                 or self._item_buffer.is_adding_to_db()
             ):
             ):
                 return False
                 return False
 
 
-            # 检测 request_buffer 状态
+            # STEP 5.4 检测 request_buffer 状态
             if (
             if (
                 self._request_buffer.get_requests_count() > 0
                 self._request_buffer.get_requests_count() > 0
                 or self._request_buffer.is_adding_to_db()
                 or self._request_buffer.is_adding_to_db()
             ):
             ):
                 return False
                 return False
 
 
-            tools.delay_time(1)
+            tools.delay_time(1) # 休眠 1 秒
 
 
         return True
         return True
 
 
@@ -322,16 +329,15 @@ class Scheduler(threading.Thread):
         """
         """
         检查任务状态 预警
         检查任务状态 预警
         """
         """
-        # 每分钟检查一次
+        # step 每分钟检查一次
         now_time = time.time()
         now_time = time.time()
-        if now_time - self._last_check_task_status_time > 30:
+        if now_time - self._last_check_task_status_time > 60:
             self._last_check_task_status_time = now_time
             self._last_check_task_status_time = now_time
         else:
         else:
             return
             return
 
 
-        # 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
+        # step 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
         task_count = self._redisdb.zget_count(self._tab_requests)
         task_count = self._redisdb.zget_count(self._tab_requests)
-        print(task_count)
 
 
         if task_count:
         if task_count:
             if task_count != self._last_task_count:
             if task_count != self._last_task_count:
@@ -342,7 +348,7 @@ class Scheduler(threading.Thread):
                     tools.get_current_timestamp(),
                     tools.get_current_timestamp(),
                 )  # 多进程会重复发消息, 使用reids记录上次统计时间
                 )  # 多进程会重复发消息, 使用reids记录上次统计时间
             else:
             else:
-                # 判断时间间隔是否超过20分钟
+                # step 判断时间间隔是否超过20分钟
                 lua = """
                 lua = """
                     -- local key = KEYS[1]
                     -- local key = KEYS[1]
                     local field = ARGV[1]
                     local field = ARGV[1]
@@ -350,7 +356,7 @@ class Scheduler(threading.Thread):
 
 
                     -- 取值
                     -- 取值
                     local last_timestamp = redis.call('hget', KEYS[1], field)
                     local last_timestamp = redis.call('hget', KEYS[1], field)
-                    if last_timestamp and current_timestamp - last_timestamp >= 600 then
+                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                         return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                         return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                     end
                     end
 
 
@@ -372,13 +378,11 @@ class Scheduler(threading.Thread):
                 )
                 )
 
 
                 if overtime:
                 if overtime:
-                    # 发送报警
+                    # step 记录日志,并发送报警
                     msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                     msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                         self._spider_name, tools.format_seconds(overtime)
                         self._spider_name, tools.format_seconds(overtime)
                     )
                     )
-                    log.error(msg)
-                    log.error("爬虫任务异常停滞,爬虫将强制退出")
-                    exit()
+                    log.error(msg)  # TODO 这一步可以加一个print,在平台的日志框里输出
                     self.send_msg(
                     self.send_msg(
                         msg,
                         msg,
                         level="error",
                         level="error",
@@ -459,9 +463,20 @@ class Scheduler(threading.Thread):
         self._started.clear()
         self._started.clear()
 
 
     def send_msg(self, msg, level="debug", message_prefix=""):
     def send_msg(self, msg, level="debug", message_prefix=""):
+        #TODO 这个方法是消息预警,但如果每次都发送,会造成消息轰炸,所以采集框架的消息预警没有开启,
+        # 后续优化方向:消息预警的内容可以通过接口接收并保存,并按紧急程度区分,紧急度高的消息可直接发送至微信群;这里尽量不要直接存储,feapder
+        # 框架不进行mongo的直接存储,只做查询操作
         # log.debug("发送报警 level:{} msg{}".format(level, msg))
         # log.debug("发送报警 level:{} msg{}".format(level, msg))
         tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
         tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
 
 
+    def get_argvs(self):
+        argvs = {"next_page": False, "max_page": 10}
+        for item in sys.argv[1:]:
+            print(item)
+            if item.startswith("--"):
+                argvs[item.replace("--", "").split('=')[0]] = eval(item.split('=')[-1]) # 此处使用eval的原因是字符串转bool或int
+        return json.loads(json.dumps(argvs), object_hook=Obj)
+
     def spider_begin(self):
     def spider_begin(self):
         """
         """
         @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
         @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
@@ -474,6 +489,8 @@ class Scheduler(threading.Thread):
             self._begin_callback()
             self._begin_callback()
 
 
         for parser in self._parsers:
         for parser in self._parsers:
+            parser.platform_next_page = self.get_argvs().next_page
+            parser.platform_max_page = self.get_argvs().max_page
             parser.start_callback()
             parser.start_callback()
 
 
         # 记录开始时间
         # 记录开始时间
@@ -486,16 +503,16 @@ class Scheduler(threading.Thread):
             # 发送消息
             # 发送消息
             # self.send_msg("《%s》爬虫开始" % self._spider_name)
             # self.send_msg("《%s》爬虫开始" % self._spider_name)
 
 
-    def spider_end(self):
+    def spider_end(self): # step end 爬虫结束时的一些操作
         self.record_end_time()
         self.record_end_time()
 
 
-        if self._end_callback:
+        if self._end_callback:  # 若设置了自定义的结束回调,则在此执行
             self._end_callback()
             self._end_callback()
 
 
         for parser in self._parsers:
         for parser in self._parsers:
             if not self._keep_alive:
             if not self._keep_alive:
-                parser.close()
-            parser.end_callback()
+                parser.close() # 爬虫可自定义close
+            parser.end_callback() # 调用结束回调函数,可在爬虫自定义
 
 
         if not self._keep_alive:
         if not self._keep_alive:
             # 关闭webdirver
             # 关闭webdirver
@@ -537,10 +554,10 @@ class Scheduler(threading.Thread):
                 self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
                 self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
             )
             )
 
 
-    def is_reach_next_spider_time(self):
+    def is_reach_next_spider_time(self): # 如果没有设置爬虫的启动时间,这一块儿不需要管的
         if not self._batch_interval:
         if not self._batch_interval:
             return True
             return True
-
+        # compare the last finish time with the current time; if the interval has not elapsed, block and wait before starting the spider
         last_spider_end_time = self._redisdb.hget(
         last_spider_end_time = self._redisdb.hget(
             self._tab_spider_time, SPIDER_END_TIME_KEY
             self._tab_spider_time, SPIDER_END_TIME_KEY
         )
         )

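Note on the `get_argvs` helper added above: the platform passes paging flags on the command line (for example `--next_page=True --max_page=5`), and the JSON round-trip with the `Obj` object hook turns the dict into attribute access for `spider_begin`. A minimal standalone sketch of that behaviour, with a hypothetical `Obj` stand-in (the real hook is defined elsewhere in the fork):

import json
import sys

class Obj:
    # hypothetical stand-in for the Obj hook used above: exposes dict keys as attributes
    def __init__(self, d):
        self.__dict__.update(d)

def get_argvs(argv=None):
    argvs = {"next_page": False, "max_page": 10}
    for item in (argv if argv is not None else sys.argv[1:]):
        if item.startswith("--"):
            key, _, value = item[2:].partition("=")
            argvs[key] = eval(value)  # trusted platform input only: "True" -> bool, "5" -> int
    return json.loads(json.dumps(argvs), object_hook=Obj)

args = get_argvs(["--next_page=True", "--max_page=5"])
print(args.next_page, args.max_page)  # True 5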
+ 76 - 116
FworkSpider/feapder/dedup/__init__.py

@@ -2,98 +2,48 @@
 """
 """
 Created on 2018-12-13 21:08
 Created on 2018-12-13 21:08
 ---------
 ---------
-@summary:
+@summary:  SHA-256 + Redis-cluster deduplication, the dedup used in production
 ---------
 ---------
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
 
 
 import copy
 import copy
-from typing import Any, List, Union, Optional, Tuple, Callable
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-
+from typing import Any, List, Union, Tuple, Callable
+import rediscluster
+from Crypto.Hash import SHA256
+from feapder import setting
 
 
 class Dedup:
 class Dedup:
     BloomFilter = 1
     BloomFilter = 1
     MemoryFilter = 2
     MemoryFilter = 2
     ExpireFilter = 3
     ExpireFilter = 3
+    def __init__(self, filter_type: int = BloomFilter):
+        self._to_sha256 = True
+        self._to_redis = None
 
 
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        """
-        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
-        Args:
-            filter_type: 过滤器类型 BloomFilter
-            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
-            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
-            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
-            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
-            to_md5: 去重前是否将数据转为MD5,默认是
-            redis_url: redis://[[username]:[password]]@localhost:6379/0
-                       BloomFilter 与 ExpireFilter 使用
-                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
-            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
-            error_rate:布隆过滤器的误判率 默认0.00001
-            **kwargs:
-        """
-
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
-                "name", "bloomfilter"
-            )
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
-                )
-
-        self._to_md5 = to_md5
+    @property
+    def redis_cluster(self):  # lazily connect to the Redis cluster
+        if not self._to_redis:
+            startup_nodes = [{"host": i.get("host"), "port": i.get("port")} for i in setting.REDISCLUSTER]
+            self._to_redis =  rediscluster.RedisCluster(startup_nodes=startup_nodes, decode_responses=True)
+        return self._to_redis
 
 
     def __repr__(self):
     def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
+        return 'sha256'
+
+    def sha256(self, info):
+        if info is None:
+            return ''
+        res = SHA256.new(info.encode('utf-8'))
+        data = res.hexdigest()
+        return data
+
+    def _deal_datas(self, datas):  # hash the incoming data before the dedup check
+        if self._to_sha256:
             if isinstance(datas, list):
             if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
+                keys = [self.sha256(data) for data in datas]
             else:
             else:
-                keys = get_md5(datas)
+                keys = self.sha256(datas)
         else:
         else:
             keys = copy.deepcopy(datas)
             keys = copy.deepcopy(datas)
 
 
@@ -108,11 +58,35 @@ class Dedup:
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         """
         """
-
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
+        is_added = self.insert_key(keys, skip_check)
 
 
         return is_added
         return is_added
+    def insert_key(self, keys, skip_check):
+        # skip_check is kept for interface compatibility; keys live for two years
+        if isinstance(keys, list):
+            is_added = []
+            for key in keys:
+                if not self.redis_cluster.exists("pylist_" + key):
+                    self.redis_cluster.set("pylist_" + key, 1, ex=86400 * 365 * 2)
+                    is_added.append(1)
+                else:
+                    is_added.append(0)
+            return is_added
+        if not self.redis_cluster.exists("pylist_" + keys):
+            self.redis_cluster.set("pylist_" + keys, 1, ex=86400 * 365 * 2)
+            return 1
+        return 0
+
+    def exists(self,keys):
+        exists = []
+        if isinstance(keys, list):
+            for key in keys:
+                exists.append(self.exit_key(key))
+        else:
+            exists.append(self.exit_key(keys))
+        return exists
+    def exit_key(self,key):
+        if self.redis_cluster.exists(key):
+            return True
+        if self.redis_cluster.exists("pylist_"+key):
+            return True
+        return False
+
+
 
 
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
         """
         """
@@ -121,58 +95,44 @@ class Dedup:
         @return: list / 单个值 (存在返回1 不存在返回0)
         @return: list / 单个值 (存在返回1 不存在返回0)
         """
         """
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
+        is_exists = self.exists(keys)
 
 
         return is_exists
         return is_exists
 
 
+
     def filter_exist_data(
     def filter_exist_data(
         self,
         self,
         datas: List[Any],
         datas: List[Any],
         *,
         *,
-        datas_fingerprints: Optional[List] = None,
         callback: Callable[[Any], None] = None
         callback: Callable[[Any], None] = None
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
         """
         """
         过滤掉已存在的数据
         过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
         @param datas: 数据 列表
         @param datas: 数据 列表
         @param callback: 数据已存在时的回调 callback(data)
         @param callback: 数据已存在时的回调 callback(data)
         @return: None
         @return: None
+        e.g. is_exists = [0, 1, 1], datas = ["b", "c", "d"]  ->  datas is left as ["b"]
         """
         """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
+        is_exists = self.get(datas)
         dedup_datas = []
         dedup_datas = []
+        while is_exists:
+            data = datas.pop(0)
+            is_exist = is_exists.pop(0)
 
 
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas
+            if not is_exist:
+                dedup_datas.append(data)
+            else:
+                if callback:
+                    callback(data)
+
+        datas.extend(dedup_datas)
+        return datas
+
+if __name__ == '__main__':
+    dedup = Dedup(Dedup.BloomFilter)
+    href = 'http://www.ccgp-tianjin.gov.cn/viewer.do?id=339715380&ver=2222'
+    ss = dedup.filter_exist_data([href])
+    # res = dedup.add([href,'llk'])
+    print(ss)
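For reference, the rewritten `Dedup` above assumes `setting.REDISCLUSTER` is a list of node dicts and stores every fingerprint under `pylist_<sha256>` with a two-year TTL. A hedged usage sketch (the node addresses are placeholders, not the real cluster):

# setting.py -- assumed shape of the cluster config read by Dedup.redis_cluster
REDISCLUSTER = [
    {"host": "192.168.3.207", "port": 7000},   # placeholder nodes
    {"host": "192.168.3.208", "port": 7001},
]

# spider side
from feapder.dedup import Dedup

dedup = Dedup()
href = "http://www.example.com/notice/1.html"
print(dedup.get(href))    # [False] -- not seen yet
dedup.add(href)           # writes key "pylist_<sha256(href)>" with ex=86400*365*2
print(dedup.get(href))    # [True]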

+ 178 - 0
FworkSpider/feapder/dedup/old__init__.py

@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-12-13 21:08
+---------
+@summary: Bloom-filter deduplication, the dedup used by the test framework
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import copy
+from typing import Any, List, Union, Optional, Tuple, Callable
+
+from feapder.utils.tools import get_md5
+from .bloomfilter import BloomFilter, ScalableBloomFilter
+from .expirefilter import ExpireFilter
+
+
+class Dedup:
+    BloomFilter = 1
+    MemoryFilter = 2
+    ExpireFilter = 3
+
+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
+        """
+        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
+        Args:
+            filter_type: 过滤器类型 BloomFilter
+            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
+            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
+            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
+            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
+            to_md5: 去重前是否将数据转为MD5,默认是
+            redis_url: redis://[[username]:[password]]@localhost:6379/0
+                       BloomFilter 与 ExpireFilter 使用
+                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
+            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
+            error_rate:布隆过滤器的误判率 默认0.00001
+            **kwargs:
+        """
+
+        if filter_type == Dedup.ExpireFilter:
+            try:
+                expire_time = kwargs["expire_time"]
+            except:
+                raise ValueError("需传参数 expire_time")
+
+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
+                "name", expire_time
+            )
+            expire_time_record_key = "dedup:expire_set:expire_time"
+
+            self.dedup = ExpireFilter(
+                name=name,
+                expire_time=expire_time,
+                expire_time_record_key=expire_time_record_key,
+                redis_url=kwargs.get("redis_url"),
+            )
+
+        else:
+            initial_capacity = kwargs.get("initial_capacity", 100000000)
+            error_rate = kwargs.get("error_rate", 0.00001)
+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
+                "name", "bloomfilter"
+            )
+            if filter_type == Dedup.BloomFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
+                    redis_url=kwargs.get("redis_url"),
+                )
+            elif filter_type == Dedup.MemoryFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
+                )
+            else:
+                raise ValueError(
+                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
+                )
+
+        self._to_md5 = to_md5
+
+    def __repr__(self):
+        return str(self.dedup)
+
+    def _deal_datas(self, datas):
+        if self._to_md5:
+            if isinstance(datas, list):
+                keys = [get_md5(data) for data in datas]
+            else:
+                keys = get_md5(datas)
+        else:
+            keys = copy.deepcopy(datas)
+
+        return keys
+
+    def add(
+        self, datas: Union[List[Any], Any], skip_check: bool = False
+    ) -> Union[List[Any], Any]:
+        """
+        添加数据
+        @param datas: list / 单个值
+        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
+        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
+        """
+
+        keys = self._deal_datas(datas)
+        is_added = self.dedup.add(keys, skip_check)
+
+        return is_added
+
+    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
+        """
+        检查数据是否存在
+        @param datas: list / 单个值
+        @return: list / 单个值 (存在返回1 不存在返回0)
+        """
+        keys = self._deal_datas(datas)
+        is_exists = self.dedup.get(keys)
+
+        return is_exists
+
+    def filter_exist_data(
+        self,
+        datas: List[Any],
+        *,
+        datas_fingerprints: Optional[List] = None,
+        callback: Callable[[Any], None] = None
+    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
+        """
+        过滤掉已存在的数据
+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
+        @param datas_fingerprints: 数据的唯一指纹 列表
+        @param datas: 数据 列表
+        @param callback: 数据已存在时的回调 callback(data)
+        @return: None
+        """
+
+        is_exists = self.get(datas_fingerprints or datas)
+
+        dedup_datas = []
+
+        if datas_fingerprints:
+            dedup_datas_fingerprints = []
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+                data_fingerprint = datas_fingerprints.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                    dedup_datas_fingerprints.append(data_fingerprint)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas_fingerprints.extend(dedup_datas_fingerprints)
+            datas.extend(dedup_datas)
+            return datas, datas_fingerprints
+
+        else:
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas.extend(dedup_datas)
+            return datas

+ 1 - 3
FworkSpider/feapder/network/cookie_pool.py

@@ -103,9 +103,7 @@ class PageCookiePool(CookiePoolInterface):
         """
         """
         with WebDriver(**self._kwargs) as driver:
         with WebDriver(**self._kwargs) as driver:
             driver.get(self._page_url)
             driver.get(self._page_url)
-
             cookies = driver.get_cookies()
             cookies = driver.get_cookies()
-
             cookies_json = {}
             cookies_json = {}
             for cookie in cookies:
             for cookie in cookies:
                 cookies_json[cookie["name"]] = cookie["value"]
                 cookies_json[cookie["name"]] = cookie["value"]
@@ -242,7 +240,7 @@ class LoginCookiePool(CookiePoolInterface):
         self._password_key = password_key
         self._password_key = password_key
 
 
         self._redisdb = RedisDB()
         self._redisdb = RedisDB()
-        self._mysqldb = MysqlDB()
+        self._mysqldb = None  # MySQL lookup disabled in this fork
 
 
         self.create_userbase()
         self.create_userbase()
 
 

+ 1 - 1
FworkSpider/feapder/network/proxy_pool.py

@@ -1,6 +1,6 @@
 # coding:utf8
 # coding:utf8
 """
 """
-代理池
+Proxy pool (deprecated)
 """
 """
 import datetime
 import datetime
 import json
 import json

+ 83 - 27
FworkSpider/feapder/network/request.py

@@ -31,7 +31,6 @@ class Request(object):
     session = None
     session = None
     webdriver_pool: WebDriverPool = None
     webdriver_pool: WebDriverPool = None
     user_agent_pool = user_agent
     user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
 
 
     cache_db = None  # redis / pika
     cache_db = None  # redis / pika
     cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
     cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
@@ -91,6 +90,8 @@ class Request(object):
         is_abandoned=False,
         is_abandoned=False,
         render=False,
         render=False,
         render_time=0,
         render_time=0,
+        splash=False,
+        iframes=0,
         **kwargs,
         **kwargs,
     ):
     ):
         """
         """
@@ -146,6 +147,8 @@ class Request(object):
         self.download_midware = download_midware
         self.download_midware = download_midware
         self.is_abandoned = is_abandoned
         self.is_abandoned = is_abandoned
         self.render = render
         self.render = render
+        self.splash = splash
+        self.iframes = iframes
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
 
 
         self.requests_kwargs = {}
         self.requests_kwargs = {}
@@ -197,12 +200,6 @@ class Request(object):
 
 
         return self.__class__.webdriver_pool
         return self.__class__.webdriver_pool
 
 
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool()
-
-        return self.__class__.proxies_pool
 
 
     @property
     @property
     def to_dict(self):
     def to_dict(self):
@@ -295,14 +292,15 @@ class Request(object):
 
 
         # 代理
         # 代理
         proxies = self.requests_kwargs.get("proxies", -1)
         proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
+        if not self.render:
+            if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+                while True:
+                    proxies = self.get_proxy()
+                    if proxies:
+                        self.requests_kwargs.update(proxies=proxies)
+                        break
+                    else:
+                        log.debug("暂无可用代理 ...")
 
 
         log.debug(
         log.debug(
             """
             """
@@ -331,10 +329,6 @@ class Request(object):
             )
             )
         )
         )
 
 
-        # def hooks(response, *args, **kwargs):
-        #     print(response.url)
-        #
-        # self.requests_kwargs.update(hooks={'response': hooks})
 
 
         use_session = (
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
             setting.USE_SESSION if self.use_session is None else self.use_session
@@ -353,15 +347,12 @@ class Request(object):
                 if cookie_str:
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
                     cookies = tools.get_cookies_from_str(cookie_str)
 
 
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
 
 
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
 
 
             try:
             try:
+                if proxies:
+                    self.chage_ip(browser)
                 browser.get(self.url)
                 browser.get(self.url)
                 if cookies:
                 if cookies:
                     browser.cookies = cookies
                     browser.cookies = cookies
@@ -393,6 +384,49 @@ class Request(object):
         elif use_session:
         elif use_session:
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
             response = Response(response)
+        elif self.splash:
+            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
+                'iframes': self.iframes,
+                'wait': self.render_time,
+                'html': 1,
+                'proxy': self.get_proxy().get("http"),
+                'url': self.url
+            })
+
+            response = Response(resp)
+
+            # if self.iframes:
+            # # response = Response(resp)
+            #     res = resp.json()
+            #     response = Response.from_dict(
+            #         {
+            #             "url": self.url,
+            #             "cookies": resp.cookies,
+            #             "_content": res.get("html"),
+            #             "status_code": 200,
+            #             "resp": resp,
+            #             "elapsed": 666,
+            #             "headers":resp.headers
+            #         }
+            #     )
+            # else:
+            #     res = resp.json()
+            #     html = res.get("html")
+            #     for item in res.get("childFrames"):
+            #         html += item.get("html")
+            #
+            #     response = Response.from_dict(
+            #         {
+            #             "url": self.url,
+            #             "cookies": resp.cookies,
+            #             "_content": html,
+            #             "status_code": 200,
+            #             "resp": res,
+            #             "elapsed": 666,
+            #             "headers": resp.headers
+            #
+            #         }
+            #     )
         else:
         else:
             response = requests.request(method, self.url, **self.requests_kwargs)
             response = requests.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
             response = Response(response)
@@ -404,9 +438,7 @@ class Request(object):
 
 
     def proxies(self):
     def proxies(self):
         """
         """
-
         Returns: {"https": "https://ip:port", "http": "http://ip:port"}
         Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-
         """
         """
         return self.requests_kwargs.get("proxies")
         return self.requests_kwargs.get("proxies")
 
 
@@ -422,6 +454,29 @@ class Request(object):
                 "https", ""
                 "https", ""
             ).strip("https://")
             ).strip("https://")
 
 
+    def get_proxy(self):
+        headers = {
+            "Authorization": setting.JIANYU_PROXY_AUTHOR
+        }
+        proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
+        print(f"切换代理:{proxy.get('data')}")
+        return proxy.get("data")
+
+    def chage_ip(self,browser):
+        ip = self.get_proxy().get("http")  # ip format: "127.0.0.1:80"
+        ip = ip.split("//")[-1]
+        browser.get("about:config")
+        browser.find_element_by_id("warningButton").click()
+        # JS that rewrites Firefox's proxy preferences in place
+        setupScript = '''var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
+        prefs.setIntPref("network.proxy.type", 1);
+        prefs.setCharPref("network.proxy.socks", "%s");
+        prefs.setIntPref("network.proxy.socks_port", %s);
+        ''' % (
+        ip.split(':')[0], ip.split(':')[1])
+        # run the JS
+        browser.execute_script(setupScript)
+
     def user_agent(self):
     def user_agent(self):
         headers = self.requests_kwargs.get("headers")
         headers = self.requests_kwargs.get("headers")
         if headers:
         if headers:
@@ -490,6 +545,7 @@ class Request(object):
             try:
             try:
                 response_obj = self.get_response(save_cached=save_cached)
                 response_obj = self.get_response(save_cached=save_cached)
             except FunctionTimedOut:
             except FunctionTimedOut:
+                response_obj = None
                 log.info("请求超时")
                 log.info("请求超时")
                 log.info("requests", extra={"url": self.url, "code": 0})
                 log.info("requests", extra={"url": self.url, "code": 0})
 
 

+ 513 - 0
FworkSpider/feapder/network/request6.29.py

@@ -0,0 +1,513 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-25 11:49:08
+---------
+@summary: 请求结构体
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import requests
+from func_timeout import func_set_timeout, FunctionTimedOut
+from requests.adapters import HTTPAdapter
+from requests.cookies import RequestsCookieJar
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.network import user_agent
+from feapder.network.proxy_pool import ProxyPool
+from feapder.network.response import Response
+from feapder.utils.log import Log
+from feapder.utils.webdriver import WebDriverPool
+log = Log()
+# 屏蔽warning信息
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+
+class Request(object):
+    session = None
+    webdriver_pool: WebDriverPool = None
+    user_agent_pool = user_agent
+    proxies_pool: ProxyPool = None
+
+    cache_db = None  # redis / pika
+    cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
+    cached_expire_time = 1200  # 缓存过期时间
+
+    local_filepath = None
+    oss_handler = None
+
+    __REQUEST_ATTRS__ = {
+        # 'method', 'url', 必须传递 不加入**kwargs中
+        "params",
+        "data",
+        "headers",
+        "cookies",
+        "files",
+        "auth",
+        "timeout",
+        "allow_redirects",
+        "proxies",
+        "hooks",
+        "stream",
+        "verify",
+        "cert",
+        "json",
+    }
+
+    DEFAULT_KEY_VALUE = dict(
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+    )
+
+    def __init__(
+        self,
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+        **kwargs,
+    ):
+        """
+        @summary: Request参数
+        ---------
+        框架参数
+        @param url: 待抓取url
+        @param retry_times: 当前重试次数
+        @param priority: 优先级 越小越优先 默认300
+        @param parser_name: 回调函数所在的类名 默认为当前类
+        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
+        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
+        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
+        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
+        @param use_session: 是否使用session方式
+        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
+        @param download_midware: 下载中间件。默认为parser中的download_midware
+        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
+        @param render: 是否用浏览器渲染
+        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
+        --
+        以下参数与requests参数使用方式一致
+        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
+        @param params: 请求参数
+        @param data: 请求body
+        @param json: 请求json字符串,同 json.dumps(data)
+        @param headers:
+        @param cookies: 字典 或 CookieJar 对象
+        @param files:
+        @param auth:
+        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
+        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @param verify: 为 True 时将会验证 SSL 证书
+        @param stream: 如果为 False,将会立即下载响应内容
+        @param cert:
+        --
+        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
+        ---------
+        @result:
+        """
+
+        self.url = url
+        self.retry_times = retry_times
+        self.priority = priority
+        self.parser_name = parser_name
+        self.callback = callback
+        self.filter_repeat = filter_repeat
+        self.auto_request = auto_request
+        self.request_sync = request_sync
+        self.use_session = use_session
+        self.random_user_agent = random_user_agent
+        self.download_midware = download_midware
+        self.is_abandoned = is_abandoned
+        self.render = render
+        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
+
+        self.requests_kwargs = {}
+        for key, value in kwargs.items():
+            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
+                self.requests_kwargs[key] = value
+
+            self.__dict__[key] = value
+
+    def __repr__(self):
+        try:
+            return "<Request {}>".format(self.url)
+        except:
+            return "<Request {}>".format(str(self.to_dict)[:40])
+
+    def __setattr__(self, key, value):
+        """
+        针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
+        @param key:
+        @param value:
+        @return:
+        """
+        self.__dict__[key] = value
+
+        if key in self.__class__.__REQUEST_ATTRS__:
+            self.requests_kwargs[key] = value
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    @property
+    def _session(self):
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session 优先级高
+        if use_session and not self.__class__.session:
+            self.__class__.session = requests.Session()
+            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
+            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
+            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
+            self.__class__.session.mount("http", http_adapter)
+
+        return self.__class__.session
+
+    @property
+    def _webdriver_pool(self):
+        if not self.__class__.webdriver_pool:
+            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
+
+        return self.__class__.webdriver_pool
+
+    @property
+    def _proxies_pool(self):
+        if not self.__class__.proxies_pool:
+            self.__class__.proxies_pool = ProxyPool()
+
+        return self.__class__.proxies_pool
+
+    @property
+    def to_dict(self):
+        request_dict = {}
+
+        self.callback = (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+        self.download_midware = (
+            getattr(self.download_midware, "__name__")
+            if callable(self.download_midware)
+            else self.download_midware
+        )
+
+        for key, value in self.__dict__.items():
+            if (
+                key in self.__class__.DEFAULT_KEY_VALUE
+                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
+                or key == "requests_kwargs"
+            ):
+                continue
+
+            if key in self.__class__.__REQUEST_ATTRS__:
+                if not isinstance(
+                    value, (bytes, bool, float, int, str, tuple, list, dict)
+                ):
+                    value = tools.dumps_obj(value)
+            else:
+                if not isinstance(value, (bytes, bool, float, int, str)):
+                    value = tools.dumps_obj(value)
+
+            request_dict[key] = value
+
+        return request_dict
+
+    @property
+    def callback_name(self):
+        return (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+
+    @func_set_timeout(30)
+    def get_response(self, save_cached=False):
+        """
+        获取带有selector功能的response
+        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
+        @return:
+        """
+        # 设置超时默认时间
+        self.requests_kwargs.setdefault(
+            "timeout", setting.REQUEST_TIMEOUT
+        )  # connect=22 read=22
+
+        # 设置stream
+        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
+        self.requests_kwargs.setdefault("stream", True)
+
+        # 关闭证书验证
+        self.requests_kwargs.setdefault("verify", False)
+
+        # 设置请求方法
+        method = self.__dict__.get("method")
+        if not method:
+            if "data" in self.requests_kwargs:
+                method = "POST"
+            else:
+                method = "GET"
+
+        # 随机user—agent
+        headers = self.requests_kwargs.get("headers", {})
+        if "user-agent" not in headers and "User-Agent" not in headers:
+            if self.render:  # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
+                ua = setting.WEBDRIVER.get(
+                    "user_agent"
+                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+            else:
+                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+
+            if self.random_user_agent and setting.RANDOM_HEADERS:
+                headers.update({"User-Agent": ua})
+                self.requests_kwargs.update(headers=headers)
+        else:
+            self.requests_kwargs.setdefault(
+                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
+            )
+
+        # 代理
+        proxies = self.requests_kwargs.get("proxies", -1)
+        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+            while True:
+                proxies = self._proxies_pool.get()
+                if proxies:
+                    self.requests_kwargs.update(proxies=proxies)
+                    break
+                else:
+                    log.debug("暂无可用代理 ...")
+
+        log.debug(
+            """
+                -------------- %srequest for ----------------
+                url  = %s
+                method = %s
+                body = %s
+                """
+            % (
+                ""
+                if not self.parser_name
+                else "%s.%s "
+                % (
+                    self.parser_name,
+                    (
+                        self.callback
+                        and callable(self.callback)
+                        and getattr(self.callback, "__name__")
+                        or self.callback
+                    )
+                    or "parse",
+                ),
+                self.url,
+                method,
+                self.requests_kwargs,
+            )
+        )
+
+        # def hooks(response, *args, **kwargs):
+        #     print(response.url)
+        #
+        # self.requests_kwargs.update(hooks={'response': hooks})
+
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session 优先级高
+
+        if self.render:
+            # 使用request的user_agent、cookies、proxy
+            user_agent = headers.get("User-Agent") or headers.get("user-agent")
+            cookies = self.requests_kwargs.get("cookies")
+            print(cookies)
+            if cookies and isinstance(cookies, RequestsCookieJar):
+                cookies = cookies.get_dict()
+
+            if not cookies:
+                cookie_str = headers.get("Cookie") or headers.get("cookie")
+                if cookie_str:
+                    cookies = tools.get_cookies_from_str(cookie_str)
+
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+
+            try:
+                browser.get(self.url)
+                if cookies:
+                    browser.cookies = cookies
+                if self.render_time:
+                    tools.delay_time(self.render_time)
+
+                html = browser.page_source
+                response = Response.from_dict(
+                    {
+                        "url": browser.current_url,
+                        "cookies": browser.cookies,
+                        "_content": html.encode(),
+                        "status_code": 200,
+                        "elapsed": 666,
+                        "headers": {
+                            "User-Agent": browser.execute_script(
+                                "return navigator.userAgent"
+                            ),
+                            "Cookie": tools.cookies2str(browser.cookies),
+                        },
+                    }
+                )
+
+                response.browser = browser
+            except Exception as e:
+                self._webdriver_pool.remove(browser)
+                raise e
+
+        elif use_session:
+            response = self._session.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+        else:
+            response = requests.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+
+        if save_cached:
+            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
+        log.info("requests",extra={"url":response.url,"code":response.status_code})
+        return response
+
+    def proxies(self):
+        """
+
+        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
+
+        """
+        return self.requests_kwargs.get("proxies")
+
+    def proxy(self):
+        """
+
+        Returns: ip:port
+
+        """
+        proxies = self.proxies()
+        if proxies:
+            return proxies.get("http", "").strip("http://") or proxies.get(
+                "https", ""
+            ).strip("https://")
+
+    def user_agent(self):
+        headers = self.requests_kwargs.get("headers")
+        if headers:
+            return headers.get("user_agent") or headers.get("User-Agent")
+
+    @property
+    def fingerprint(self):
+        """
+        request唯一表识
+        @return:
+        """
+        url = self.__dict__.get("url", "")
+        # url 归一化
+        url = tools.canonicalize_url(url)
+        args = [url]
+
+        for arg in ["params", "data", "files", "auth", "cert", "json"]:
+            if self.requests_kwargs.get(arg):
+                args.append(self.requests_kwargs.get(arg))
+
+        return tools.get_md5(*args)
+
+    @property
+    def _cache_db(self):
+        if not self.__class__.cache_db:
+            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
+
+        return self.__class__.cache_db
+
+    @property
+    def _cached_redis_key(self):
+        if self.__class__.cached_redis_key:
+            return (
+                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
+            )
+        else:
+            return f"response_cached:test:{self.fingerprint}"
+
+    def save_cached(self, response, expire_time=1200):
+        """
+        使用redis保存response 用于调试 不用每回都下载
+        @param response:
+        @param expire_time: 过期时间
+        @return:
+        """
+
+        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
+
+    def get_response_from_cached(self, save_cached=True):
+        """
+        从缓存中获取response
+        注意:
+            属性值为空:
+                -raw : urllib3.response.HTTPResponse
+                -connection:requests.adapters.HTTPAdapter
+                -history
+
+            属性含义改变:
+                - request 由requests 改为Request
+        @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
+        @return:
+        """
+        response_dict = self._cache_db.strget(self._cached_redis_key)
+        if not response_dict:
+            log.info("无response缓存  重新下载")
+            try:
+                response_obj = self.get_response(save_cached=save_cached)
+            except FunctionTimedOut:
+                log.info("请求超时")
+                log.info("requests", extra={"url": self.url, "code": 0})
+
+        else:
+            response_dict = eval(response_dict)
+            response_obj = Response.from_dict(response_dict)
+        return response_obj
+
+    def del_response_cached(self):
+        self._cache_db.clear(self._cached_redis_key)
+
+    @classmethod
+    def from_dict(cls, request_dict):
+        for key, value in request_dict.items():
+            if isinstance(value, bytes):  # 反序列化 如item
+                request_dict[key] = tools.loads_obj(value)
+
+        return cls(**request_dict)
+
+    def copy(self):
+        return self.__class__.from_dict(self.to_dict)

+ 31 - 13
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -2,14 +2,14 @@
 """
 """
 Created on {DATE}
 Created on {DATE}
 ---------
 ---------
-@summary:
+@summary: ${spider_name}
 ---------
 ---------
 @author: {USER}
 @author: {USER}
 """
 """
-
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
+from items.spider_item import DataBakItem,MgpListItem,ListItem
 from feapder.dedup import Dedup
 from feapder.dedup import Dedup
 from collections import namedtuple
 from collections import namedtuple
 
 
@@ -17,21 +17,20 @@ from collections import namedtuple
 class ${spider_name}(feapder.Spider):
 class ${spider_name}(feapder.Spider):
 
 
     def start_callback(self):
     def start_callback(self):
-         self.count = 0
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
 
          self.menus = [
          self.menus = [
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
+             Menu('${spider_name} channel name', '${spider_name} spidercode', "custom type", 1),
+             Menu('${spider_name} channel name', '${spider_name} spidercode', "Notice", 1),
          ]
          ]
     def start_requests(self):
     def start_requests(self):
          for menu in self.menus:
          for menu in self.menus:
-            start_url = f''
-            yield feapder.Request(url=start_url, item=menu._asdict())
+             for page in range(1,menu.crawl_page+1):
+                 start_url = f''
+                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
 
 
     def parse(self, request, response):
     def parse(self, request, response):
         menu = request.item
         menu = request.item
-        self.count += 1   # 一个计数器
         dedup = Dedup(Dedup.BloomFilter)
         dedup = Dedup(Dedup.BloomFilter)
         href_list = []
         href_list = []
         info_list = []
         info_list = []
@@ -56,15 +55,34 @@ class ${spider_name}(feapder.Spider):
             list_item.parse = "self.detail_get"
             list_item.parse = "self.detail_get"
             list_item.parser_name = "details"
             list_item.parser_name = "details"
             list_item.item = data_item.to_dict
             list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
+            list_item.deal_detail = ['//div[@class="****"]',"*****"]
+            list_item.proxies = False
             list_item.parse_url = href
             list_item.parse_url = href
+            list_item.pri = 1
+            list_item.files = {
+                "list_xpath": '//div[@class="notice-foot"]/a',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                "files_type": ('zip', 'docx', 'ftp'),
+                "file_type": 'zip',
+                "url_key": 'attachmentDownload',
+                # "host": 'http',
+                "kwargs": {"headers": {
+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
+                }}
+            }
             href_list.append(href)
             href_list.append(href)
             yield list_item
             yield list_item
+        channel_list = ListItem()  # avoid shadowing the built-in list
+        channel_list.site = self.site
+        channel_list.channel = menu.get("channel")
+        channel_list.spidercode = menu.get("code")
+        channel_list.url = request.url
+        channel_list.count = len(info_list)
+        channel_list.rel_count = len(href_list)
         dedup.add(href_list)
         dedup.add(href_list)
 
 
     def end_callback(self):
     def end_callback(self):
         print("爬虫结束")
         print("爬虫结束")
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    ${spider_name}(redis_key="fwork:${spider_name}").start()
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 1 - 1
FworkSpider/feapder/templates/spider_template.tmpl

@@ -64,4 +64,4 @@ class ${spider_name}(feapder.Spider):
         return request
         return request
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    ${spider_name}(redis_key="fwork:${spider_name}").start()
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 37 - 17
FworkSpider/feapder/utils/aliyun.py

@@ -1,3 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 12:39 AM
+---------
+@summary:  Aliyun OSS attachment upload
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
 import hashlib
 import hashlib
 import os
 import os
 import traceback
 import traceback
@@ -56,7 +65,7 @@ class UploadOSS:
                 else:
                 else:
                     return "{:.1f} kb".format(_kb)
                     return "{:.1f} kb".format(_kb)
 
 
-    def get_state(self, attachment, **kwargs):
+    def get_state(self, attachment,count=0, **kwargs):
         """
         """
         下载附件并上传阿里oss
         下载附件并上传阿里oss
 
 
@@ -78,7 +87,10 @@ class UploadOSS:
                 if not os.path.exists(img_dir):
                 if not os.path.exists(img_dir):
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                 # 打开目录,放入下载的附件
                 # 打开目录,放入下载的附件
-                self.file_path = "{}/{}".format(img_dir, attachment["filename"])
+                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filname = filname.hexdigest() #加密1次
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filname+'.'+types)
                 with open(self.file_path, 'wb') as f:
                 with open(self.file_path, 'wb') as f:
                     f.write(self.file_stream)
                     f.write(self.file_stream)
                 # 上传附件
                 # 上传附件
@@ -89,13 +101,16 @@ class UploadOSS:
                 # 返回附件上传处理信息
                 # 返回附件上传处理信息
                 return file_state
                 return file_state
             else:
             else:
-                attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                attachment["url"] = 'oss'
-                attachment["fid"] = self.fid + "." + attachment["ftype"]
-                attachment["size"] = '0kb'
-                attachment["false"] = True
-                return attachment
-    def post_state(self, attachment, **kwargs):
+                if count<3:
+                    return self.post_state(attachment, count=count + 1, **kwargs)
+                else:
+                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
+    def post_state(self, attachment,count=0, **kwargs):
         """
         """
         下载附件并上传阿里oss
         下载附件并上传阿里oss
 
 
@@ -116,7 +131,10 @@ class UploadOSS:
                 if not os.path.exists(img_dir):
                 if not os.path.exists(img_dir):
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                 # 打开目录,放入下载的附件
                 # 打开目录,放入下载的附件
-                self.file_path = "{}/{}{}".format(img_dir,time.time(),attachment["filename"])
+                filename = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filename = filename.hexdigest()  # md5 the original name once
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filename + '.' + types)
 
 
                 with open(self.file_path, 'wb') as f:
                 with open(self.file_path, 'wb') as f:
                     f.write(self.file_stream)
                     f.write(self.file_stream)
@@ -128,12 +146,14 @@ class UploadOSS:
                 # 返回附件上传处理信息
                 # 返回附件上传处理信息
                 return file_state
                 return file_state
             else:
             else:
-                attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                attachment["url"] = 'oss'
-                attachment["fid"] = self.fid + "." + attachment["ftype"]
-                attachment["size"] = '0kb'
-                attachment["false"] = True
-                return attachment
+                if count<3:
+                    return self.post_state(attachment, count=count + 1, **kwargs)
+                else:
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
 
 
     def put_oss_from_local(self):
     def put_oss_from_local(self):
         """上传一个本地文件到阿里OSS的普通文件"""
         """上传一个本地文件到阿里OSS的普通文件"""
@@ -148,7 +168,7 @@ class UploadOSS:
         @param attachment: 附件
         @param attachment: 附件
         @return: 附件上传处理信息
         @return: 附件上传处理信息
         """
         """
-        attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
         attachment["url"] = 'oss'
         attachment["url"] = 'oss'
         attachment["fid"] = self.fid + "." + attachment["ftype"]
         attachment["fid"] = self.fid + "." + attachment["ftype"]
         attachment["size"] = self.file_size
         attachment["size"] = self.file_size

+ 1 - 1
FworkSpider/feapder/utils/email_sender.py

@@ -2,7 +2,7 @@
 """
 """
 Created on 2020/2/19 12:57 PM
 Created on 2020/2/19 12:57 PM
 ---------
 ---------
-@summary:
+@summary: email sending
 ---------
 ---------
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com

+ 41 - 47
FworkSpider/feapder/utils/log.py

@@ -10,10 +10,11 @@ Created on 2018-12-08 16:50
 import logging
 import logging
 import os
 import os
 import sys
 import sys
+import time
 from logging.handlers import BaseRotatingHandler
 from logging.handlers import BaseRotatingHandler
 
 
-import logstash
 import loguru
 import loguru
+import pymongo
 from better_exceptions import format_exception
 from better_exceptions import format_exception
 
 
 import feapder.setting as setting
 import feapder.setting as setting
@@ -40,45 +41,47 @@ class RotatingFileHandler(BaseRotatingHandler):
         self.max_bytes = max_bytes
         self.max_bytes = max_bytes
         self.backup_count = backup_count
         self.backup_count = backup_count
         self.placeholder = str(len(str(backup_count)))
         self.placeholder = str(len(str(backup_count)))
+        self._to_db = None
+        self.filename = filename
+
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+
+        return self._to_db.pyspider
 
 
-    def doRollover(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-        if self.backup_count > 0:
-            for i in range(self.backup_count - 1, 0, -1):
-                sfn = ("%0" + self.placeholder + "d.") % i  # '%2d.'%i -> 02
-                sfn = sfn.join(self.baseFilename.split("."))
-                # sfn = "%d_%s" % (i, self.baseFilename)
-                # dfn = "%d_%s" % (i + 1, self.baseFilename)
-                dfn = ("%0" + self.placeholder + "d.") % (i + 1)
-                dfn = dfn.join(self.baseFilename.split("."))
-                if os.path.exists(sfn):
-                    # print "%s -> %s" % (sfn, dfn)
-                    if os.path.exists(dfn):
-                        os.remove(dfn)
-                    os.rename(sfn, dfn)
-            dfn = (("%0" + self.placeholder + "d.") % 1).join(
-                self.baseFilename.split(".")
-            )
-            if os.path.exists(dfn):
-                os.remove(dfn)
-            # Issue 18940: A file may not have been created if delay is True.
-            if os.path.exists(self.baseFilename):
-                os.rename(self.baseFilename, dfn)
-        if not self.delay:
-            self.stream = self._open()
 
 
     def shouldRollover(self, record):
     def shouldRollover(self, record):
+        parmars = {
+            "spider_name":record.name,
+            "msg":record.msg,
+            "Message":str(record.getMessage)
+        }
+        if record.levelname == "ERROR":
+            crawl_type = 'list'
+            if 'detail' in record.name:
+                crawl_type = 'detail'
+            url = ''
+            item={
+                "recordname":record.name,
+                "spidercode":"spidercode",
+                "author":self.filename,
+                "account":"",
+                "crawl_time":time.time(),
+                "crawl_type": crawl_type,
+                "status_code":"status_code",
+                "url":url,
+                "reason":record.msg,
+                'parmars': parmars,
+            }
+
+            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
+            # print(item)
+            # print(self.to_db.error_info)
+            # self.to_db.error_info.insert_one(item)

-        if self.stream is None:  # delay was set...
-            self.stream = self._open()
-        if self.max_bytes > 0:  # are we rolling over?
-            msg = "%s\n" % self.format(record)
-            self.stream.seek(0, 2)  # due to non-posix-compliant Windows feature
-            if self.stream.tell() + len(msg) >= self.max_bytes:
-                return 1
-        return 0


 def get_logger(
@@ -87,7 +90,6 @@ def get_logger(
     log_level=None,
     is_write_to_console=None,
     is_write_to_file=None,
-    is_send_to_logstash = None,
     color=None,
     mode=None,
     max_bytes=None,
@@ -111,7 +113,6 @@ def get_logger(
     @result:
     """
     # 加载setting里最新的值
-    # name = os.path.split(os.getcwd())[-1]
     name = name or setting.LOG_NAME
     path = path or setting.LOG_PATH
     log_level = log_level or setting.LOG_LEVEL
@@ -125,11 +126,6 @@ def get_logger(
         if is_write_to_file is not None
         else setting.LOG_IS_WRITE_TO_FILE
     )
-    is_send_to_logstash = (
-        is_send_to_logstash
-        if is_send_to_logstash is not None
-        else setting.LOG_IS_SEND_TO_LOGSTASH
-    )
     color = color if color is not None else setting.LOG_COLOR
     mode = mode or setting.LOG_MODE
     max_bytes = max_bytes or setting.LOG_MAX_BYTES
@@ -148,8 +144,8 @@ def get_logger(

     # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
     if is_write_to_file:
-        if path and not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
+        # if path and not os.path.exists(os.path.dirname(path)):
+        #     os.makedirs(os.path.dirname(path))

         rf_handler = RotatingFileHandler(
             path,
@@ -160,8 +156,6 @@ def get_logger(
         )
         rf_handler.setFormatter(formatter)
         logger.addHandler(rf_handler)
-    if is_send_to_logstash:
-        logger.addHandler(logstash.TCPLogstashHandler(setting.LOGSTASH_IP, setting.LOGSTASH_PORT, version=1))
     if color and is_write_to_console:
         loguru_handler = InterceptHandler()
         loguru_handler.setFormatter(formatter)

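Note on the log.py change above: size-based rollover is removed and shouldRollover now only assembles an error record for ERROR-level messages; the MongoDB write itself is left commented out in this commit. A minimal sketch of what enabling that write-back could look like, reusing the connection pattern of the new to_db property (the pyspider database and error_info collection names are taken from the commented-out code; nothing beyond the hunk is assumed):

import time
import pymongo
import feapder.setting as setting

client = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
error_info = client.pyspider.error_info  # collection named in the commented-out insert

def report_error(record_name: str, reason: str, url: str = ""):
    # mirrors the item dict built in shouldRollover for ERROR records
    error_info.insert_one({
        "recordname": record_name,
        "crawl_type": "detail" if "detail" in record_name else "list",
        "crawl_time": time.time(),
        "url": url,
        "reason": reason,
    })
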
+ 1 - 1
FworkSpider/feapder/utils/redis_lock.py

@@ -107,7 +107,7 @@ class RedisLock:
                 time.sleep(1)
                 continue
             self.redis_conn.expire(self.lock_key, expire + 5)  # 延长5秒
-            time.sleep(5)  # 临过期5秒前,再次延长
+            time.sleep(expire)  # 临过期5秒前,再次延长
             spend_time += expire
             if self.lock_timeout and spend_time > self.lock_timeout:
                 log.info("锁超时,释放")

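The redis_lock.py change makes the renewal loop sleep a full expire interval instead of a fixed 5 seconds, so spend_time actually tracks how long the lock has been held. A simplified, self-contained sketch of that loop (the surrounding RedisLock class is assumed; names follow the hunk above):

import time

def keep_lock_alive(redis_conn, lock_key, expire, lock_timeout):
    spend_time = 0
    while True:
        redis_conn.expire(lock_key, expire + 5)  # push the TTL out again
        time.sleep(expire)                       # was a fixed 5-second nap before this commit
        spend_time += expire                     # now matches the real elapsed time
        if lock_timeout and spend_time > lock_timeout:
            break                                # held past lock_timeout -> caller releases the lock
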
+ 2 - 5
FworkSpider/feapder/utils/tools.py

@@ -7,7 +7,6 @@ Created on 2018-09-06 14:21
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-print('123木头人')
 import asyncio
 import calendar
 import codecs
@@ -48,7 +47,6 @@ from w3lib.url import canonicalize_url as _canonicalize_url
 import feapder.setting as setting
 from feapder.utils.email_sender import EmailSender
 from feapder.utils.log import log
-
 os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js

 # 全局取消ssl证书验证
@@ -58,8 +56,7 @@ TIME_OUT = 30
 TIMER_TIME = 5

 redisdb = None
-def ccmu():
-    print('sss')
+

 def get_redisdb():
     global redisdb
@@ -75,7 +72,7 @@ def get_redisdb():
     return redisdb


-# 装饰器
+# 装饰器 -- 单例模式
 class Singleton(object):
     def __init__(self, cls):
         self._cls = cls

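The comment change above labels Singleton as a singleton decorator; only its head is visible in this hunk. A typical completion and usage, shown here as an assumed sketch rather than the file's actual body:

class Singleton(object):
    def __init__(self, cls):
        self._cls = cls
        self._instance = None

    def __call__(self, *args, **kwargs):
        # create the wrapped class once, then keep handing back the same object
        if self._instance is None:
            self._instance = self._cls(*args, **kwargs)
        return self._instance

@Singleton
class Config:
    pass

assert Config() is Config()  # both calls return the same instance
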
+ 3 - 8
FworkSpider/feapder/utils/webdriver.py

@@ -22,6 +22,7 @@ DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit


 class WebDriver(RemoteWebDriver):
+    '''浏览器采集 - selenium'''
     CHROME = "CHROME"
     PHANTOMJS = "PHANTOMJS"
     FIREFOX = "FIREFOX"
@@ -111,12 +112,6 @@ class WebDriver(RemoteWebDriver):
             firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
             firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
             # firefox_capabilities["marionette"] = True  # http代理的使用
-            # firefox_capabilities["proxy"] = {
-            #     "proxyType": "MANUAL",
-            #     "httpProxy": proxy,
-            #     "ftpProxy": proxy,
-            #     "sslProxy": proxy,
-            # }

         if self._user_agent:
             firefox_profile.set_preference(
@@ -279,8 +274,8 @@ class WebDriver(RemoteWebDriver):
         else:
             raise AttributeError

-    # def __del__(self):
-    #     self.quit()
+    def __del__(self):
+        self.quit()


 @Singleton

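Re-enabling __del__ means a WebDriver instance now calls quit() when it is garbage-collected, so dropping the last reference closes the browser. A toy illustration of the pattern (hypothetical class, not feapder's):

class AutoQuit:
    def quit(self):
        print("browser closed")

    def __del__(self):
        # same idea as the re-enabled WebDriver.__del__ above
        self.quit()

d = AutoQuit()
del d  # prints "browser closed" once the object is collected
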
+ 4 - 4
FworkSpider/mongo_pipeline.py

@@ -9,17 +9,16 @@ Created on 2021-04-18 14:12:21
 """
 from typing import Dict, List, Tuple
 import time
-# from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
 from feapder.dedup import Dedup
 from feapder.pipelines import BasePipeline
 from feapder.utils.log import log
 from untils.tools import *
-# from crawlab import save_item



-class MongoPipeline(BasePipeline):
+class RedisPipeline(BasePipeline):
+    '''数据存储管道-redis版'''
     def __init__(self):
         self._to_db = None

@@ -27,6 +26,7 @@ class MongoPipeline(BasePipeline):
     def to_db(self):
         if not self._to_db:
             self._to_db = RedisDB()
+            print("创建新连接?")

         return self._to_db

@@ -42,7 +42,7 @@ class MongoPipeline(BasePipeline):
         """
         try:
             add_count = self.to_db.lpush(table="savemongo:"+table, values=items)
-            # add_count = self.to_db.lpop(table="savemongo:"+table, values=items)
+            print(add_count)
             datas_size = len(items)
             log.info(
                 "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"

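With this rename the pipeline no longer writes MongoDB itself: save_items pushes each batch onto a Redis list named savemongo:<table> for a downstream writer to consume. A minimal sketch of that call, using the same RedisDB class imported in the diff (connection details come from the REDISDB_* settings):

from feapder.db.redisdb import RedisDB

db = RedisDB()
items = [{"title": "demo", "href": "http://example.com/1"}]
added = db.lpush(table="savemongo:" + "mgp_list", values=items)
print("queued:", added)
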
+ 98 - 0
FworkSpider/mongo_pipeline_old.py

@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 导出数据
+---------
+@author: 马国鹏
+@email:  305021384@qq.com
+"""
+from typing import Dict, List, Tuple
+import time
+from feapder.db.mongodb import MongoDB
+from feapder.dedup import Dedup
+from feapder.pipelines import BasePipeline
+from feapder.utils.log import log
+from untils.tools import *
+# from crawlab import save_item
+
+
+
+class MongoPipeline(BasePipeline):
+    def __init__(self):
+        self._to_db = None
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+            print("创建新连接?")
+
+        return self._to_db
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+        """
+        try:
+            add_count = self.to_db.add_batch(coll_name=table, datas=items)
+            for item in items:
+                dedup = Dedup(Dedup.BloomFilter)
+                dedup.add([item.get("href")])
+                # save_item({'count':item.get("href")})
+            datas_size = len(items)
+            log.info(
+                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
+                % (datas_size, table, add_count, datas_size - add_count)
+            )
+            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
+            # for i in range(add_count):
+            # if table == "mgp_list":
+            #     save_item({"site": "失败回填", "title": add_count})
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
+
+    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+        try:
+            # self.to_db.find()
+            add_count = self.to_db.add_batch(
+                coll_name=table,
+                datas=items,
+                update_columns=update_keys or list(items[0].keys()),
+            )
+            datas_size = len(items)
+            update_count = datas_size - add_count
+            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
+                datas_size,
+                table,
+                add_count,
+                update_count,
+            )
+            if update_keys:
+                msg += " 更新字段为 {}".format(update_keys)
+            log.info(msg)
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False

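The retired pipeline above wrote straight to MongoDB with add_batch and then registered each href in the Bloom-filter dedup. A condensed sketch of that registration step, as it appears in save_items:

from feapder.dedup import Dedup

items = [{"href": "http://example.com/1"}, {"href": "http://example.com/2"}]
for item in items:
    dedup = Dedup(Dedup.BloomFilter)
    dedup.add([item.get("href")])  # mark the URL as seen after a successful insert
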
+ 39 - 10
FworkSpider/setting.py

@@ -24,11 +24,11 @@ MONGO_DB = "py_spider"
 # # REDIS
 # # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
 # REDISDB_IP_PORTS = "192.168.20.51:6379"  # 本地 docker 环境
-REDISDB_IP_PORTS = "172.19.0.1:6379"  # 本地环境
+REDISDB_IP_PORTS = "172.19.0.1:6379"  # 环境
 # REDISDB_USER_PASS = ""
 REDISDB_DB = 10
 # # 适用于redis哨兵模式
-REDISDB_SERVICE_NAME = "quchoong"
+REDISDB_SERVICE_NAME = "quchoong"  # 没用到
 #
 # # 数据入库的pipeline,可自定义,默认MysqlPipeline
 ITEM_PIPELINES = [
@@ -44,9 +44,9 @@ EXPORT_DATA_MAX_RETRY_TIMES = 5 # 导出数据时最大的重试次数,包括
 # COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
 # COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
 #
-REDIS_KEY = "fwork"
+REDIS_KEY = "fwork" # 没用到
 # # SPIDER
-# SPIDER_THREAD_COUNT = 4  # 爬虫并发数
+SPIDER_THREAD_COUNT = 1  # 爬虫并发数
 # SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
 # SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
 SPIDER_MAX_RETRY_TIMES = 5  # 每个请求最大重试次数
@@ -123,7 +123,8 @@ WECHAT_WARNING_PHONE = "马国鹏"  # 报警人 将会在群内@此人, 支持
 WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
 # # 时间间隔
 WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+WARNING_LEVEL = "INFO"  # 报警级别, DEBUG / ERROR
 WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
 #
 #LOG_NAME = os.path.basename(os.getcwd())
@@ -134,19 +135,47 @@ LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
 LOG_LEVEL = "INFO"
 LOG_COLOR = True  # 是否带有颜色
 LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
-LOG_IS_WRITE_TO_FILE = False  # 是否写文件
-LOG_MODE = "w"  # 写文件的模式
+# LOG_IS_WRITE_TO_FILE = True  # 是否写文件
+# LOG_MODE = "w"  # 写文件的模式
 LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
 LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_ENCODING = "utf8"  # 日志文件编码
-OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
+OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级 一般用不到
 #
 # # 切换工作路径为当前项目路径
 # project_path = os.path.abspath(os.path.dirname(__file__))
 # os.chdir(project_path)  # 切换工作路经
 # sys.path.insert(0, project_path)
 # print('当前工作路径为 ' + os.getcwd())
+
+# 代理服务-未解析的
 jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
+
 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
-oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-datafile'}
-author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
+
+# 文件存储功能的配置信息
+oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh',
+      'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
+# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
+
+author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
+
+# 线上代理服务的api地址
+JIANYU_PROXY_URL = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
+JIANYU_PROXY_AUTHOR = 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'
+
+# splash 渲染服务的api接口配置
+JIANYU_SPLASH_URL = 'http://8.131.72.226:8998/render.json'
+
+# 测试环境的redis集群 -- url去重专用
+REDISCLUSTER =  [
+                {"host": "192.168.3.207", "port": "2179"},
+                {"host": "192.168.3.166", "port": "2379"}
+            ]
+
+# 正式环境的redis集群 -- url去重专用
+# REDISCLUSTER =  [
+#                 {"host": "172.17.4.239", "port": "2479"},
+#                 {"host": "172.17.4.240", "port": "2579"},
+#                 {"host": "172.17.4.84", "port": "2379"}
+#             ]

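The new JIANYU_PROXY_URL / JIANYU_PROXY_AUTHOR settings are consumed by the get_proxy helper added to untils/tools.py later in this commit; a minimal sketch of that request (setting is the project-root module, as imported elsewhere in the repo):

import requests
import setting

resp = requests.get(setting.JIANYU_PROXY_URL,
                    headers={"Authorization": setting.JIANYU_PROXY_AUTHOR})
proxies = resp.json().get("data")  # e.g. a dict with an "http" proxy entry
print(proxies)
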
+ 141 - 95
FworkSpider/untils/attachment.py

@@ -1,72 +1,22 @@
 import hashlib
 import os
-import re
+import sys
 import traceback
 import uuid
-from urllib.parse import urlparse, unquote
-
+from urllib import request
 import requests
 import urllib3
-
 from feapder.setting import headers
 from untils.execptions import AttachmentNullError
 from untils.aliyun import AliYunService
 from untils.proxy_pool import ProxyPool
-
+import time
+import tqdm
 urllib3.disable_warnings()
-
-
-def hex_sha1(val):
-    sha1 = hashlib.sha1()
-    if isinstance(val, bytes):
-        sha1.update(str(val).encode("utf-8"))
-    elif isinstance(val, str):
-        sha1.update(val.encode("utf-8"))
-    res = sha1.hexdigest()
-    return res
-
-
-def extract_file_type(text):
-    if text is None:
-        return None
-
-    file_types = {
-        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
-    }
-    for file_type in file_types:
-        tmp = [file_type, file_type.upper()]
-        for t in tmp:
-            result = re.match(f'.*{t}$', text, re.S)
-            if result is not None:
-                return t
-    else:
-        return None
-
-
-def extract_file_name(href: str, file_type: str):
-    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
-    # 中文字符:[\u4e00 -\u9fa5]
-    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
-    parser = urlparse(href)
-    query = (parser.query or parser.path)
-    result = re.search(f'.*\\.{file_type}', query, re.S)
-    if result is not None:
-        encode_str = unquote(result.group())
-        name = re.search(zh_char_pattern, encode_str)
-        if name is not None:
-            return unquote(name.group())
-    return None
-
-
-def verify_file_name(name):
-    if extract_file_type(name) is None:
-        raise ValueError
-
-
 class AttachmentDownloader:
-
+    '''附件下载模块'''
     def __init__(self):
-        self.dir_name = '/file'
+        self.dir_name = 'file'

     def create_dir(self):
         if not os.path.exists(self.dir_name):
@@ -74,13 +24,52 @@ class AttachmentDownloader:

     def create_file_path(self, filename, file_type):
         self.create_dir()
-        sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
+        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
         tmp_name = "{}.{}".format(sign, file_type)
         return "{}/{}".format(self.dir_name, tmp_name)

+    def hex_sha1(self,val):
+        sha1 = hashlib.sha1()
+        if isinstance(val, bytes):
+            sha1.update(str(val).encode("utf-8"))
+        elif isinstance(val, str):
+            sha1.update(val.encode("utf-8"))
+        res = sha1.hexdigest()
+        return res
+
     @staticmethod
     def create_fid(file_stream: bytes):
-        return hex_sha1(file_stream)
+        sha1 = hashlib.sha1()
+        if isinstance(file_stream, bytes):
+            sha1.update(str(file_stream).encode("utf-8"))
+        elif isinstance(file_stream, str):
+            sha1.update(file_stream.encode("utf-8"))
+        res = sha1.hexdigest()
+        return res
+
+
+    @staticmethod
+    def clean_attachment(file_path):
+        os.remove(file_path)
+
+    @staticmethod
+    def getsize(file_path: str):
+        def _getsize(filename):
+            try:
+                return os.path.getsize(filename)
+            except:
+                return 0
+
+        _kb = float(_getsize(file_path)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
 
 
     @staticmethod
     @staticmethod
     def _fetch_attachment(
     def _fetch_attachment(
@@ -94,20 +83,28 @@ class AttachmentDownloader:
         request_params.setdefault('headers', kwargs.get('headers') or headers)
         request_params.setdefault('proxies', kwargs.get('proxies'))
         request_params.setdefault('timeout', kwargs.get('timeout') or 60)
-        request_params.setdefault('stream', kwargs.get('stream') or True)
+        # request_params.setdefault('stream', kwargs.get('stream') or True)
         request_params.setdefault('verify', kwargs.get('verify') or False)
         if enable_proxy:
-            proxy = ProxyPool()
+            proxy = ProxyPool().get()
         else:
             proxy = {}
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url, **request_params) as req:
+                with requests.get(url,stream=True, **request_params) as req:
+                    content_size = req.headers.get('Content-Length') or 0
+                    content_size = int(content_size)
+                    stream = b''
                     if req.status_code == 200:
-                        stream = req.content
                         with open(file_path, 'wb') as f:
-                            f.write(stream)
+                            with tqdm.tqdm(total=content_size, unit='B', initial=0, unit_scale=True, unit_divisor=1024,
+                                      ascii=True,desc=file_path) as bar:
+                                for chunk in req.iter_content(chunk_size=1024*20):
+                                    if chunk:
+                                        f.write(chunk)
+                                    stream += chunk
+                                    bar.update(len(chunk))
                         return stream
                     else:
                         retries += 1
@@ -115,33 +112,10 @@ class AttachmentDownloader:
                 if allow_show_exception:
                     traceback.print_exc()
                 if enable_proxy:
-                    request_params.update({'proxies': proxy.get()})
+                    request_params.update({'proxies': ProxyPool().get()})
                 retries += 1
         return b''

-    @staticmethod
-    def clean_attachment(file_path):
-        os.remove(file_path)
-
-    @staticmethod
-    def getsize(file_path: str):
-        def _getsize(filename):
-            try:
-                return os.path.getsize(filename)
-            except:
-                return 0
-
-        _kb = float(_getsize(file_path)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
-            if _M >= 1024:
-                _G = _M / 1024
-                return "{:.1f} G".format(_G)
-            else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
-
     def fetch_attachment(
             self,
             file_name: str,
@@ -153,7 +127,6 @@ class AttachmentDownloader:
     ):
         if not file_name or not file_type or not download_url:
             raise AttachmentNullError
-
         file_path = self.create_file_path(file_name, file_type)
         file_stream = self._fetch_attachment(
             download_url,
@@ -162,6 +135,7 @@ class AttachmentDownloader:
             allow_request_exception,
             **kwargs
         )
+        # file_stream = self.download_file(download_url,file_path,enable_proxy,allow_request_exception)
         if len(file_stream) > 0:
             fid = self.create_fid(file_stream)
             '''上传/下载,无论失败成功都需要给出文件基础信息'''
@@ -188,11 +162,83 @@ class AttachmentDownloader:
             }
         return result

+    def download_file(self, url, file_path, call_func=None,enable_proxy=False,data=None):
+        """
+        Args:
+            url: 地址
+            file_path: 文件存储地址
+            call_func: 下载成功的回调
+        Returns:
+        """
+        # proxies = kwargs.get('proxies') or None
+        # data = kwargs.get('data') or None
+        start_time = time.time()
+        def progress_callfunc(blocknum, blocksize, totalsize):
+            """回调函数
+            @blocknum : 已经下载的数据块
+            @blocksize : 数据块的大小
+            @totalsize: 远程文件的大小
+            """
+            speed = (blocknum * blocksize) / (time.time() - start_time)
+            # speed_str = " Speed: %.2f" % speed
+            speed_str = " Speed: %s" % format_size(speed)
+            recv_size = blocknum * blocksize
+
+            # 设置下载进度条
+            f = sys.stdout
+            pervent = recv_size / totalsize
+            percent_str = "%.2f%%" % (pervent * 100)
+            n = round(pervent * 50)
+            s = ('#' * n).ljust(50, '-')
+            f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
+            f.flush()
+            f.write('\r')
+
+        def format_size(bytes):
+            try:
+                bytes = float(bytes)
+                kb = bytes / 1024
+            except:
+                print("传入的字节格式不对")
+                return "Error"
+            if kb >= 1024:
+                M = kb / 1024
+                if M >= 1024:
+                    G = M / 1024
+                    return "%.3fG" % (G)
+                else:
+                    return "%.3fM" % (M)
+            else:
+                return "%.3fK" % (kb)
+
+        if url:
+            try:
+                if enable_proxy:
+                    proxies = ProxyPool().get()
+                    # create the object, assign it to a variable
+                    proxy = request.ProxyHandler(proxies)
+                    # construct a new opener using your proxy settings
+                    opener = request.build_opener(proxy)
+                    # install the openen on the module-level
+                    request.install_opener(opener)
+                # 测试可以打开进度条,生产环境禁用进度条
+                filename, headers = request.urlretrieve(url, file_path, progress_callfunc, data)
+                # filename, headers = request.urlretrieve(url, file_path, data)
+                print(filename,headers)
+
+                if callable(call_func):
+                    call_func()
+                return filename
+            except Exception as e:
+                print(e)
+                return ''
+        else:
+            return ''
+
+if __name__ == '__main__':

-# if __name__ == '__main__':
-    # a = AttachmentDownloader().fetch_attachment(
-    #     file_name='成建制移民村(五标段)合同',
-    #     file_type='pdf',
-    #     download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
-    # )
-    # print(a)
+    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
+    attachment = AttachmentDownloader().fetch_attachment(
+        file_name="file_name", file_type="pdf", download_url=url,
+        enable_proxy=False)
+    print(attachment)

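The heart of the rewritten _fetch_attachment is a streamed download: 20 KB chunks are written to disk, accumulated for fid hashing, and reported through a tqdm progress bar. A condensed, self-contained sketch of that loop (parameters trimmed to the essentials; proxy/retry handling omitted):

import requests
import tqdm

def stream_download(url: str, file_path: str) -> bytes:
    stream = b''
    with requests.get(url, stream=True, timeout=60, verify=False) as req:
        total = int(req.headers.get('Content-Length') or 0)
        if req.status_code == 200:
            with open(file_path, 'wb') as f, \
                 tqdm.tqdm(total=total, unit='B', unit_scale=True,
                           unit_divisor=1024, ascii=True, desc=file_path) as bar:
                for chunk in req.iter_content(chunk_size=1024 * 20):
                    if chunk:
                        f.write(chunk)       # persist the chunk
                        stream += chunk      # keep bytes in memory for create_fid()
                        bar.update(len(chunk))
    return stream
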
+ 136 - 0
FworkSpider/untils/cleaner.py

@@ -0,0 +1,136 @@
+import re
+__all__ = ['cleaner']
+
+# 独立元素
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
+    '\\xa0|\\u3000': '',  # 空格
+    '<!--[\s\S]*?-->': '',  # 注释
+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # 输入框
+    '<img[^>]*>': '<br>',  # 图片
+}
+# 行内元素
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # 超链接
+    '<link>|<link [^>]*>|</link>': '',  # 超链接
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+}
+# 块级元素
+BLOCK_TAGS = {
+    '<div>\s*?</div>':'',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
+    '<p>|<p [^>]*>': '<br>',  # 段落
+    '</p>': '',  # 段落
+    '<div>|<div [^>]*>': '<br>',  # 分割 division
+    '</div>': '',  # 分割 division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
+}
+# 其他
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+
+}
+# 样式
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+
+}
+# 空白符
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css标签集合
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css属性集合
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+    OTHER.update(special)
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html

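Quick usage sketch for the new cleaner module (import path assumes the FworkSpider project root is on sys.path):

from untils.cleaner import cleaner

raw = '<html><body><div style="color:red"><p>公告内容</p><script>alert(1)</script></div></body></html>'
print(cleaner(raw))                   # scripts/styles/attributes stripped, <br> kept for line breaks
print(cleaner(raw, completely=True))  # additionally drops canvas/iframe and stray non-CJK tags
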
+ 3 - 2
FworkSpider/untils/cookie_pool.py

@@ -131,6 +131,7 @@ class PageCookiePool(CookiePoolInterface):
                         )
                     )
                     try:
+                        print('????')
                         cookies = self.create_cookie()
                         if cookies:
                             self.add_cookies(cookies)
@@ -178,7 +179,7 @@ class PageCookiePool(CookiePoolInterface):
             try:
                 cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
                 if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
+                    log.info("暂无cookie 生产中...")
                     self._keep_alive = False
                     self._min_cookies = 1
                     with RedisLock(
@@ -291,7 +292,7 @@ class LoginCookiePool(CookiePoolInterface):
             try:
                 user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                 if not user_cookie and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
+                    log.info("暂无cookie 生产中...")
                     self.login()
                     continue


+ 1 - 1
FworkSpider/untils/create_menus.py

@@ -19,7 +19,7 @@ class Details:
         return self._to_db_xs
     def main(self,page):
         menus_list = []
-        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "中国南方航空采购招标网", "$options": "$i"}}})
+        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
         # print(data)
         for item in data:
             # print(item)

+ 7 - 0
FworkSpider/untils/get_imgcode.py

@@ -12,3 +12,10 @@ def get_code(file_path: str) -> dict:
     response = requests.post(upload_address, headers=headers, files=content, stream=True)
     return response.json()

+def get_code_det(image_bytes) -> dict:
+   upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
+   content = {'image_content': image_bytes}
+   headers = {'accept': 'application/json'}
+   response = requests.post(upload_address, headers=headers, files=content, stream=True)
+   return response.json()
+

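Usage sketch for the new get_code_det helper, which posts raw image bytes to the verify_det endpoint (captcha.png is a placeholder file name):

from untils.get_imgcode import get_code_det

with open("captcha.png", "rb") as f:
    result = get_code_det(f.read())
print(result)  # JSON response from the detection service
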
+ 55 - 89
FworkSpider/untils/tools.py

@@ -7,82 +7,15 @@ from setting import WECHAT_WARNING_URL,WECHAT_WARNING_PHONE,WARNING_INTERVAL,WEC
 import bson
 from feapder.utils.log import log
 from feapder.db.mongodb import MongoDB
-
+from .cleaner import cleaner
+import sys

 SearchText = namedtuple('SearchText', ['total'])


-def substitute(html_str):
+def substitute(html_str,special=None, completely=False):
     """HTML 替换"""
-    patterns = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<span .*?>': '',
-        '</span> ': '',
-        '</span>': '',
-        '<span>': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '</div>': '<br>',
-        '<img .*?>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "align='.*?'": '',
-        'align=".*?"': '',
-        'border=".*?"': '',
-        "border='.*?'": '',
-        'cellpadding=".*?"': '',
-        "cellpadding='.*?'": '',
-        'cellspacing=".*?"': '',
-        "cellspacing='.*?'": '',
-        'center=".*?"': '',
-        "center='.*?'": '',
-        'width=".*?"': '',
-        "width='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body.*?>': '',
-        '</body>': '',
-        '<script.*?>': '',
-        '</script>': '',
-        '【关闭】': '',
-        '【打印】': '',
-        'function .*?() ': '',
-        'var .*?;': '',
-        'if .*?\)': '',
-        '{[^{}]+}': '',
-        '{.*?}': '',
-    }
-
-    def substitutes(k, v, c):
-        return re.sub(k, v, c)
-
-    for k, v in patterns.items():
-        html_str = re.sub(k, v, substitutes(k, v, html_str), re.S, re.M)
+    html_str = cleaner(html=html_str,special=None, completely=False)
     return html_str


@@ -188,6 +121,15 @@ class CustomCheckError(JyBasicException):
         self.err_details = kwargs
         for key, val in kwargs.items():
             setattr(self, key, val)
+class HtmlEmptyError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = '正文获取异常,正文为空', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
 class CheckPrePareRequest:

     def __init__(self):
@@ -200,33 +142,57 @@ class CheckPrePareRequest:
             '终止', '系统'
         }

-    @staticmethod
-    def check_es_cache(title: str, publish_time: int, rows: dict):
-        """
-
-        :param title:  标题
-        :param publish_time: 发布时间的时间戳(l_np_publishtime)
-        :param rows: 采集内容
-        """
-        pass
-        # retrieved_result = es_query(title, publish_time)
-        # if retrieved_result != 0:
-        #     '''es查询数据结果'''
-        #     rows['count'] = retrieved_result
-        #     raise CustomCheckError(code=10105, reason='标题内容已存在es')
-
     def check_crawl_title(self, title: str):
     def check_crawl_title(self, title: str):
         for keyword in self.crawl_keywords:
             valid_keyword = re.search(keyword, title)
             if valid_keyword is not None:
                 break
         else:
+            # raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+            return 10106,'标题未检索到采集关键词'
+        return 200,'ok'
+
 
 
     def __check(self, rows: dict):
     def __check(self, rows: dict):
         title, publish_time = rows['title'], rows['l_np_publishtime']
         title, publish_time = rows['title'], rows['l_np_publishtime']
         self.check_crawl_title(title)
         self.check_crawl_title(title)
-        self.check_es_cache(title, publish_time, rows)
 
 
     def __call__(self, rows: dict, *args, **kwargs):
     def __call__(self, rows: dict, *args, **kwargs):
-        self.__check(rows)
+        self.__check(rows)
+
+def get_proxy():
+    headers = {
+        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+    }
+    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+    print(f"切换代理:{proxy.get('data')}")
+    return proxy.get("data").get("http")
+import json
+
+class Obj(object):
+    def __init__(self, dict_):
+        self.__dict__.update(dict_)
+
+def get_argvs():
+    argvs = {"next_page":False,"max_page":10}
+    for item in sys.argv[1:]:
+        print(item)
+        if item.startswith("--"):
+            argvs[item.replace("--", "").split('=')[0]] = int(item.split('=')[-1])
+    return json.loads(json.dumps(argvs), object_hook=Obj)
+
+def search(pattern, string):
+    result = re.search(pattern, string)
+    if result:
+        return result.groups()[0]
+
+def search_construction(string):
+    result = re.search('pattern', string)
+    if result:
+        return result.groups()[0]
+
+def search_floor(string):
+    result = re.search('pattern', string)
+    if result:
+        return result.groups()[0]
+

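The new get_argvs helper turns --key=value command-line flags into attribute access, with next_page/max_page defaults baked in. A small usage sketch:

import sys
from untils.tools import get_argvs

sys.argv += ["--max_page=3"]   # what a crawl scheduler might pass on the command line
args = get_argvs()
print(args.max_page)   # 3 (parsed as int)
print(args.next_page)  # False (default)
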
+ 0 - 23
NoteWork/cesspider/__init__.py

@@ -1,23 +0,0 @@
-__all__ = [
-    "ces",
-    "css",
-    "cssw",
-    "demo",
-    "down_ces",
-    "example",
-    "hubeijianzhu",
-    "jiangxistouces",
-    "magpces",
-    "qyzcdzzbcgjypt",
-    "中国南方电网电子采购交易平台",
-    "交通银行供应商门户",
-    "华创迅采电子采购平台",
-    "国家税务总局宁波市税务局",
-    "城轨采购网",
-    "山西省招标投标协会",
-    "测试查询",
-    "甘肃政府采购网",
-    "甘肃政府采购网_ces",
-    "甘肃政府采购网_new",
-    "福建省政府采购网"
-]

+ 0 - 247
NoteWork/cesspider/cesspider

@@ -1,247 +0,0 @@
-Thread-5|2022-01-24 09:41:46,749|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Zglbsww|2022-01-24 09:41:46,754|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Thread-4|2022-01-24 09:41:46,758|collector.py|__input_data|line:108|INFO| 重置丢失任务完毕,共8条
-Zglbsww|2022-01-24 09:41:46,766|scheduler.py|__add_task|line:215|INFO| 检查到有待做任务 8 条,不重下发新任务,将接着上回异常终止处继续抓取
-Thread-5|2022-01-24 09:41:47,763|request.py|get_response|line:305|DEBUG| 
-                -------------- Zglbsww.parse request for ----------------
-                url  = https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                method = POST
-                body = {'proxies': False, 'data': '{"timeType": "month", "areaCode": "-1", "mainType": "-1", "purchaser": null, "information": null, "sTime": "", "eTime": "", "classify": "-1", "region": "-1", "level": "", "selectedState": "", "purchaseType": "-1", "noticeType": 1, "orders": "publish_time", "dirs": "desc", "current": 1, "size": 10, "page": {}}', 'headers': {'Content-Type': 'application/json', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36'}, 'timeout': 22, 'stream': True, 'verify': False}
-                
-Thread-3|2022-01-24 09:42:19,071|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁七局电务公司武汉北III场扩能改造工程电力电缆询价文件",
-                                                "publishtime": "2022-01-24 09:40:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485414971692441602&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:41:48",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485414971692441602&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:42:19,122|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:42:50,355|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁七局电务公司武汉北III场扩能改造工程SMC电缆沟支架询价文件",
-                                                "publishtime": "2022-01-24 09:39:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485423048057802753&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:42:18",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485423048057802753&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:42:50,411|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:43:21,545|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "轨道交通B1线项目砂浆采购询价书",
-                                                "publishtime": "2022-01-24 09:39:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485182479012319233&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:42:48",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485182479012319233&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:43:21,575|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:43:52,756|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁九局集团第四工程有限公司沈阳地铁6号线一期工程土建施工第三合同段项目经理部玻璃纤维筋询价采购",
-                                                "publishtime": "2022-01-24 09:34:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485421040418316289&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:43:18",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485421040418316289&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:43:52,776|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条

+ 0 - 6
NoteWork/cesspider/hubeijianzhu.py

@@ -1,6 +0,0 @@
-import requests
-
-url= 'http://jg.hbcic.net.cn/web/XmManage/XmxxSearch.aspx'
-res = requests.get(url)
-print(res.text)
-print(res.status_code)

+ 0 - 50
NoteWork/cesspider/jiangxistouces.py

@@ -1,50 +0,0 @@
-import requests
-
-headers = {
-    "Connection": "keep-alive",
-    "Cache-Control": "max-age=0",
-    "Upgrade-Insecure-Requests": "1",
-    "Origin": "http://www.ccgp-jilin.gov.cn",
-    "Content-Type": "application/x-www-form-urlencoded",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Referer": "http://www.ccgp-jilin.gov.cn/ext/search/morePolicyNews.action",
-    "Accept-Language": "zh-CN,zh;q=0.9"
-}
-cookies = {
-    "_gscu_1208125908": "40588857p73qs140",
-    "_gscbrs_1208125908": "1"
-}
-url = "http://www.ccgp-jilin.gov.cn/ext/search/morePolicyNews.action"
-data = {
-    "currentPage": "3",
-    "noticetypeId": "1",
-    "categoryId": "124",
-    "articleId": "1",
-    "ss": "41544c156ff82a74717a3c47c49a00d1017ab072dbbbba7bd0a0dbd087ba7b776b866418113ee01042af289114c1de749c6c79942d413015bdbfb6f2b59eba1280a7b2a6589c2a4db0d8ad7b2b5acfdd6e97a3fea1cf7cdf4bfa207d0990edf214eee9324d40425029e9fd958c810c6f86866257c925b4149bf76b6a8d338857",
-    "id": "1",
-    "pager.pageNumber": "1",
-    "las": "38"
-}
-response = requests.post(url, data=data, verify=False)
-
-print(response.text)
-print(response)
-if '前台同步提交表单使用验证方法之后无效' in response.text:
-    print('加密失败')
-import execjs
-
-# js_str = '''
-# function subdd(){
-# var _0x2adf75 = '10110';
-# var _0x5d50cf = '_0x40dee9['map']['modulus']';
-# var _0x209e59 = 'UsVjaS7Wj4';
-# var _0x3e26ea = '009f40b74f0e3bfcf449431eceedaa7984d852a754038eed36091fb0d6c1390b647f56dd82d8953c6e97678e20c7d3976ee3639dc386a2676578596bea3766d9f8f2402a8300b3cfb987dfeee63159ab1cdfe41f04fc2446f17ad8ee2878df59cba50ea4af18f6238172b55129dd7357adb90af15a3ed02bcc0bcc68d4f6c6696f';
-# _0x43ae35 = encryptedString(_0x3e26ea, encodeURIComponent(_0x209e59));
-# return _0x43ae35;}
-# '''
-# node_model_path = "D:/剑鱼爬虫/py_spiders/node_modules"
-# # with open('..cesspider/js/ces.js', 'rb') as f:
-# #     js_str = f.read()
-# ctx = execjs.compile(js_str)
-# res = ctx.call('subdd')
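
The scratch request above builds headers and cookies dicts but never passes them to requests.post, which may be one reason the server answers with the form-validation failure text. A minimal sketch of wiring them in (reusing the url, data, headers and cookies names defined above; the "ss" value would still have to come from the site's RSA routine):

    import requests

    # verify=False is kept only because the original scratch script used it.
    response = requests.post(url, headers=headers, cookies=cookies, data=data, verify=False)
    print(response.status_code, len(response.text))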

+ 0 - 80
NoteWork/cesspider/js/rsa/Barrett.js

@@ -1,80 +0,0 @@
-// BarrettMu, a class for performing Barrett modular reduction computations in
-// JavaScript.
-//
-// Requires BigInt.js.
-//
-// Copyright 2004-2005 David Shapiro.
-//
-// You may use, re-use, abuse, copy, and modify this code to your liking, but
-// please keep this header.
-//
-// Thanks!
-// 
-// Dave Shapiro
-// dave@ohdave.com 
-const BigInt = require("./BigInt");
-var biCopy = BigInt.biCopy
-var biHighIndex = BigInt.biHighIndex
-var bigInt = BigInt.bigInt
-var biDivide = BigInt.biDivide
-
-
-function BarrettMu(m)
-{
-	this.modulus = biCopy(m);
-	this.k = biHighIndex(this.modulus) + 1;
-	var b2k = new bigInt();
-	b2k.digits[2 * this.k] = 1; // b2k = b^(2k)
-	this.mu = biDivide(b2k, this.modulus);
-	this.bkplus1 = new bigInt();
-	this.bkplus1.digits[this.k + 1] = 1; // bkplus1 = b^(k+1)
-	this.modulo = BarrettMu_modulo;
-	this.multiplyMod = BarrettMu_multiplyMod;
-	this.powMod = BarrettMu_powMod;
-}
-
-function BarrettMu_modulo(x)
-{
-	var q1 = biDivideByRadixPower(x, this.k - 1);
-	var q2 = biMultiply(q1, this.mu);
-	var q3 = biDivideByRadixPower(q2, this.k + 1);
-	var r1 = biModuloByRadixPower(x, this.k + 1);
-	var r2term = biMultiply(q3, this.modulus);
-	var r2 = biModuloByRadixPower(r2term, this.k + 1);
-	var r = biSubtract(r1, r2);
-	if (r.isNeg) {
-		r = biAdd(r, this.bkplus1);
-	}
-	var rgtem = biCompare(r, this.modulus) >= 0;
-	while (rgtem) {
-		r = biSubtract(r, this.modulus);
-		rgtem = biCompare(r, this.modulus) >= 0;
-	}
-	return r;
-}
-
-function BarrettMu_multiplyMod(x, y)
-{
-	/*
-	x = this.modulo(x);
-	y = this.modulo(y);
-	*/
-	var xy = biMultiply(x, y);
-	return this.modulo(xy);
-}
-
-function BarrettMu_powMod(x, y)
-{
-	var result = new bigInt();
-	result.digits[0] = 1;
-	var a = x;
-	var k = y;
-	while (true) {
-		if ((k.digits[0] & 1) != 0) result = this.multiplyMod(result, a);
-		k = biShiftRight(k, 1);
-		if (k.digits[0] == 0 && biHighIndex(k) == 0) break;
-		a = this.multiplyMod(a, a);
-	}
-	return result;
-}
-module.exports.BarrettMu = BarrettMu
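
BarrettMu above precomputes mu = floor(b^(2k) / m) for radix b = 2^16 and then reduces x mod m with shifts, multiplies and at most a couple of subtractions. (Note that Barrett.js here only imports biCopy, biHighIndex, bigInt and biDivide from BigInt.js, while its modulo/powMod also reference biMultiply, biSubtract, biAdd, biCompare, biShiftRight and the radix-power helpers, none of which BigInt.js exports.) A minimal Python sketch of the same reduction over native integers, with illustrative helper names that are not part of the deleted code:

    B = 1 << 16  # radix used by BigInt.js

    def num_digits(n):
        """Number of base-2**16 digits of n, i.e. biHighIndex(n) + 1."""
        d = 0
        while n:
            d += 1
            n >>= 16
        return max(d, 1)

    def barrett_mod(x, m, k, mu):
        """Barrett reduction: x mod m, valid for 0 <= x < m * m."""
        q1 = x // B ** (k - 1)
        q3 = (q1 * mu) // B ** (k + 1)
        r = x % B ** (k + 1) - (q3 * m) % B ** (k + 1)
        if r < 0:
            r += B ** (k + 1)
        while r >= m:
            r -= m
        return r

    m = 0xC96BF01E                                  # illustrative modulus
    k, mu = num_digits(m), B ** (2 * num_digits(m)) // m
    assert barrett_mod(123456789 * 987654321, m, k, mu) == (123456789 * 987654321) % m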

+ 0 - 614
NoteWork/cesspider/js/rsa/BigInt.js

@@ -1,614 +0,0 @@
-// BigInt, a suite of routines for performing multiple-precision arithmetic in
-// JavaScript.
-//
-// Copyright 1998-2005 David Shapiro.
-//
-// You may use, re-use, abuse,
-// copy, and modify this code to your liking, but please keep this header.
-// Thanks!
-//
-// Dave Shapiro
-// dave@ohdave.com
-
-// IMPORTANT THING: Be sure to set maxDigits according to your precision
-// needs. Use the setMaxDigits() function to do this. See comments below.
-//
-// Tweaked by Ian Bunning
-// Alterations:
-// Fix bug in function biFromHex(s) to allow
-// parsing of strings of length != 0 (mod 4)
-
-// Changes made by Dave Shapiro as of 12/30/2004:
-//
-// The BigInt() constructor doesn't take a string anymore. If you want to
-// create a BigInt from a string, use biFromDecimal() for base-10
-// representations, biFromHex() for base-16 representations, or
-// biFromString() for base-2-to-36 representations.
-//
-// biFromArray() has been removed. Use biCopy() instead, passing a BigInt
-// instead of an array.
-//
-// The BigInt() constructor now only constructs a zeroed-out array.
-// Alternatively, if you pass <true>, it won't construct any array. See the
-// biCopy() method for an example of this.
-//
-// Be sure to set maxDigits depending on your precision needs. The default
-// zeroed-out array ZERO_ARRAY is constructed inside the setMaxDigits()
-// function. So use this function to set the variable. DON'T JUST SET THE
-// VALUE. USE THE FUNCTION.
-//
-// ZERO_ARRAY exists to hopefully speed up construction of BigInts(). By
-// precalculating the zero array, we can just use slice(0) to make copies of
-// it. Presumably this calls faster native code, as opposed to setting the
-// elements one at a time. I have not done any timing tests to verify this
-// claim.
-
-// Max number = 10^16 - 2 = 9999999999999998;
-//               2^53     = 9007199254740992;
-
-var biRadixBase = 2;
-var biRadixBits = 16;
-var bitsPerDigit = biRadixBits;
-var biRadix = 1 << 16;
-// = 2^16 = 65536
-var biHalfRadix = biRadix >>> 1;
-var biRadixSquared = biRadix * biRadix;
-var maxDigitVal = biRadix - 1;
-var maxInteger = 9999999999999998;
-
-// maxDigits:
-// Change this to accommodate your largest number size. Use setMaxDigits()
-// to change it!
-//
-// In general, if you're working with numbers of size N bits, you'll need 2*N
-// bits of storage. Each digit holds 16 bits. So, a 1024-bit key will need
-//
-// 1024 * 2 / 16 = 128 digits of storage.
-//
-
-var maxDigits;
-var ZERO_ARRAY;
-var bigZero, bigOne;
-
-function setMaxDigits(value) {
-    maxDigits = value;
-    ZERO_ARRAY = new Array(maxDigits);
-    for (var iza = 0; iza < ZERO_ARRAY.length; iza++)
-        ZERO_ARRAY[iza] = 0;
-    bigZero = new BigInt();
-    bigOne = new BigInt();
-    bigOne.digits[0] = 1;
-}
-
-setMaxDigits(20);
-
-// The maximum number of digits in base 10 you can convert to an
-// integer without JavaScript throwing up on you.
-var dpl10 = 15;
-// lr10 = 10 ^ dpl10
-var lr10 = biFromNumber(1000000000000000);
-
-function BigInt(flag) {
-    if (typeof flag == "boolean" && flag == true) {
-        this.digits = null;
-    } else {
-        this.digits = ZERO_ARRAY.slice(0);
-    }
-    this.isNeg = false;
-}
-
-function biFromDecimal(s) {
-    var isNeg = s.charAt(0) == '-';
-    var i = isNeg ? 1 : 0;
-    var result;
-    // Skip leading zeros.
-    while (i < s.length && s.charAt(i) == '0')
-        ++i;
-    if (i == s.length) {
-        result = new BigInt();
-    } else {
-        var digitCount = s.length - i;
-        var fgl = digitCount % dpl10;
-        if (fgl == 0)
-            fgl = dpl10;
-        result = biFromNumber(Number(s.substr(i, fgl)));
-        i += fgl;
-        while (i < s.length) {
-            result = biAdd(biMultiply(result, lr10), biFromNumber(Number(s.substr(i, dpl10))));
-            i += dpl10;
-        }
-        result.isNeg = isNeg;
-    }
-    return result;
-}
-
-function biCopy(bi) {
-    var result = new BigInt(true);
-    result.digits = bi.digits.slice(0);
-    result.isNeg = bi.isNeg;
-    return result;
-}
-
-function biFromNumber(i) {
-    var result = new BigInt();
-    result.isNeg = i < 0;
-    i = Math.abs(i);
-    var j = 0;
-    while (i > 0) {
-        result.digits[j++] = i & maxDigitVal;
-        i >>= biRadixBits;
-    }
-    return result;
-}
-
-function reverseStr(s) {
-    var result = "";
-    for (var i = s.length - 1; i > -1; --i) {
-        result += s.charAt(i);
-    }
-    return result;
-}
-
-var hexatrigesimalToChar = new Array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z');
-
-function biToString(x, radix) // 2 <= radix <= 36
-{
-    var b = new BigInt();
-    b.digits[0] = radix;
-    var qr = biDivideModulo(x, b);
-    var result = hexatrigesimalToChar[qr[1].digits[0]];
-    while (biCompare(qr[0], bigZero) == 1) {
-        qr = biDivideModulo(qr[0], b);
-        digit = qr[1].digits[0];
-        result += hexatrigesimalToChar[qr[1].digits[0]];
-    }
-    return (x.isNeg ? "-" : "") + reverseStr(result);
-}
-
-function biToDecimal(x) {
-    var b = new BigInt();
-    b.digits[0] = 10;
-    var qr = biDivideModulo(x, b);
-    var result = String(qr[1].digits[0]);
-    while (biCompare(qr[0], bigZero) == 1) {
-        qr = biDivideModulo(qr[0], b);
-        result += String(qr[1].digits[0]);
-    }
-    return (x.isNeg ? "-" : "") + reverseStr(result);
-}
-
-var hexToChar = new Array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
-
-function digitToHex(n) {
-    var mask = 0xf;
-    var result = "";
-    for (i = 0; i < 4; ++i) {
-        result += hexToChar[n & mask];
-        n >>>= 4;
-    }
-    return reverseStr(result);
-}
-
-function biToHex(x) {
-    var result = "";
-    var n = biHighIndex(x);
-    for (var i = biHighIndex(x); i > -1; --i) {
-        result += digitToHex(x.digits[i]);
-    }
-    return result;
-}
-
-function charToHex(c) {
-    var ZERO = 48;
-    var NINE = ZERO + 9;
-    var littleA = 97;
-    var littleZ = littleA + 25;
-    var bigA = 65;
-    var bigZ = 65 + 25;
-    var result;
-
-    if (c >= ZERO && c <= NINE) {
-        result = c - ZERO;
-    } else if (c >= bigA && c <= bigZ) {
-        result = 10 + c - bigA;
-    } else if (c >= littleA && c <= littleZ) {
-        result = 10 + c - littleA;
-    } else {
-        result = 0;
-    }
-    return result;
-}
-
-function hexToDigit(s) {
-    var result = 0;
-    var sl = Math.min(s.length, 4);
-    for (var i = 0; i < sl; ++i) {
-        result <<= 4;
-        result |= charToHex(s.charCodeAt(i))
-    }
-    return result;
-}
-
-function biFromHex(s) {
-    var result = new BigInt();
-    var sl = s.length;
-    for (var i = sl, j = 0; i > 0; i -= 4,
-    ++j) {
-        result.digits[j] = hexToDigit(s.substr(Math.max(i - 4, 0), Math.min(i, 4)));
-    }
-    return result;
-}
-
-function biFromString(s, radix) {
-    var isNeg = s.charAt(0) == '-';
-    var istop = isNeg ? 1 : 0;
-    var result = new BigInt();
-    var place = new BigInt();
-    place.digits[0] = 1;
-    // radix^0
-    for (var i = s.length - 1; i >= istop; i--) {
-        var c = s.charCodeAt(i);
-        var digit = charToHex(c);
-        var biDigit = biMultiplyDigit(place, digit);
-        result = biAdd(result, biDigit);
-        place = biMultiplyDigit(place, radix);
-    }
-    result.isNeg = isNeg;
-    return result;
-}
-
-function biToBytes(x) // Returns a string containing raw bytes.
-{
-    var result = "";
-    for (var i = biHighIndex(x); i > -1; --i) {
-        result += digitToBytes(x.digits[i]);
-    }
-    return result;
-}
-
-function digitToBytes(n) // Convert two-byte digit to string containing both bytes.
-{
-    var c1 = String.fromCharCode(n & 0xff);
-    n >>>= 8;
-    var c2 = String.fromCharCode(n & 0xff);
-    return c2 + c1;
-}
-
-function biDump(b) {
-    return (b.isNeg ? "-" : "") + b.digits.join(" ");
-}
-
-function biAdd(x, y) {
-    var result;
-
-    if (x.isNeg != y.isNeg) {
-        y.isNeg = !y.isNeg;
-        result = biSubtract(x, y);
-        y.isNeg = !y.isNeg;
-    } else {
-        result = new BigInt();
-        var c = 0;
-        var n;
-        for (var i = 0; i < x.digits.length; ++i) {
-            n = x.digits[i] + y.digits[i] + c;
-            result.digits[i] = n & 0xffff;
-            c = Number(n >= biRadix);
-        }
-        result.isNeg = x.isNeg;
-    }
-    return result;
-}
-
-function biSubtract(x, y) {
-    var result;
-    if (x.isNeg != y.isNeg) {
-        y.isNeg = !y.isNeg;
-        result = biAdd(x, y);
-        y.isNeg = !y.isNeg;
-    } else {
-        result = new BigInt();
-        var n, c;
-        c = 0;
-        for (var i = 0; i < x.digits.length; ++i) {
-            n = x.digits[i] - y.digits[i] + c;
-            result.digits[i] = n & 0xffff;
-            // Stupid non-conforming modulus operation.
-            if (result.digits[i] < 0)
-                result.digits[i] += biRadix;
-            c = 0 - Number(n < 0);
-        }
-        // Fix up the negative sign, if any.
-        if (c == -1) {
-            c = 0;
-            for (var i = 0; i < x.digits.length; ++i) {
-                n = 0 - result.digits[i] + c;
-                result.digits[i] = n & 0xffff;
-                // Stupid non-conforming modulus operation.
-                if (result.digits[i] < 0)
-                    result.digits[i] += biRadix;
-                c = 0 - Number(n < 0);
-            }
-            // Result is opposite sign of arguments.
-            result.isNeg = !x.isNeg;
-        } else {
-            // Result is same sign.
-            result.isNeg = x.isNeg;
-        }
-    }
-    return result;
-}
-
-function biHighIndex(x) {
-    var result = x.digits.length - 1;
-    while (result > 0 && x.digits[result] == 0)
-        --result;
-    return result;
-}
-
-function biNumBits(x) {
-    var n = biHighIndex(x);
-    var d = x.digits[n];
-    var m = (n + 1) * bitsPerDigit;
-    var result;
-    for (result = m; result > m - bitsPerDigit; --result) {
-        if ((d & 0x8000) != 0)
-            break;
-        d <<= 1;
-    }
-    return result;
-}
-
-function biMultiply(x, y) {
-    var result = new BigInt();
-    var c;
-    var n = biHighIndex(x);
-    var t = biHighIndex(y);
-    var u, uv, k;
-
-    for (var i = 0; i <= t; ++i) {
-        c = 0;
-        k = i;
-        for (j = 0; j <= n; ++j,
-        ++k) {
-            uv = result.digits[k] + x.digits[j] * y.digits[i] + c;
-            result.digits[k] = uv & maxDigitVal;
-            c = uv >>> biRadixBits;
-        }
-        result.digits[i + n + 1] = c;
-    }
-    // Someone give me a logical xor, please.
-    result.isNeg = x.isNeg != y.isNeg;
-    return result;
-}
-
-function biMultiplyDigit(x, y) {
-    var n, c, uv;
-
-    result = new BigInt();
-    n = biHighIndex(x);
-    c = 0;
-    for (var j = 0; j <= n; ++j) {
-        uv = result.digits[j] + x.digits[j] * y + c;
-        result.digits[j] = uv & maxDigitVal;
-        c = uv >>> biRadixBits;
-    }
-    result.digits[1 + n] = c;
-    return result;
-}
-
-function arrayCopy(src, srcStart, dest, destStart, n) {
-    var m = Math.min(srcStart + n, src.length);
-    for (var i = srcStart, j = destStart; i < m; ++i,
-    ++j) {
-        dest[j] = src[i];
-    }
-}
-
-var highBitMasks = new Array(0x0000,0x8000,0xC000,0xE000,0xF000,0xF800,0xFC00,0xFE00,0xFF00,0xFF80,0xFFC0,0xFFE0,0xFFF0,0xFFF8,0xFFFC,0xFFFE,0xFFFF);
-
-function biShiftLeft(x, n) {
-    var digitCount = Math.floor(n / bitsPerDigit);
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, digitCount, result.digits.length - digitCount);
-    var bits = n % bitsPerDigit;
-    var rightBits = bitsPerDigit - bits;
-    for (var i = result.digits.length - 1, i1 = i - 1; i > 0; --i,
-    --i1) {
-        result.digits[i] = ((result.digits[i] << bits) & maxDigitVal) | ((result.digits[i1] & highBitMasks[bits]) >>> (rightBits));
-    }
-    result.digits[0] = ((result.digits[i] << bits) & maxDigitVal);
-    result.isNeg = x.isNeg;
-    return result;
-}
-
-var lowBitMasks = new Array(0x0000,0x0001,0x0003,0x0007,0x000F,0x001F,0x003F,0x007F,0x00FF,0x01FF,0x03FF,0x07FF,0x0FFF,0x1FFF,0x3FFF,0x7FFF,0xFFFF);
-
-function biShiftRight(x, n) {
-    var digitCount = Math.floor(n / bitsPerDigit);
-    var result = new BigInt();
-    arrayCopy(x.digits, digitCount, result.digits, 0, x.digits.length - digitCount);
-    var bits = n % bitsPerDigit;
-    var leftBits = bitsPerDigit - bits;
-    for (var i = 0, i1 = i + 1; i < result.digits.length - 1; ++i,
-    ++i1) {
-        result.digits[i] = (result.digits[i] >>> bits) | ((result.digits[i1] & lowBitMasks[bits]) << leftBits);
-    }
-    result.digits[result.digits.length - 1] >>>= bits;
-    result.isNeg = x.isNeg;
-    return result;
-}
-
-function biMultiplyByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, n, result.digits.length - n);
-    return result;
-}
-
-function biDivideByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, n, result.digits, 0, result.digits.length - n);
-    return result;
-}
-
-function biModuloByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, 0, n);
-    return result;
-}
-
-function biCompare(x, y) {
-    if (x.isNeg != y.isNeg) {
-        return 1 - 2 * Number(x.isNeg);
-    }
-    for (var i = x.digits.length - 1; i >= 0; --i) {
-        if (x.digits[i] != y.digits[i]) {
-            if (x.isNeg) {
-                return 1 - 2 * Number(x.digits[i] > y.digits[i]);
-            } else {
-                return 1 - 2 * Number(x.digits[i] < y.digits[i]);
-            }
-        }
-    }
-    return 0;
-}
-
-function biDivideModulo(x, y) {
-    var nb = biNumBits(x);
-    var tb = biNumBits(y);
-    var origYIsNeg = y.isNeg;
-    var q, r;
-    if (nb < tb) {
-        // |x| < |y|
-        if (x.isNeg) {
-            q = biCopy(bigOne);
-            q.isNeg = !y.isNeg;
-            x.isNeg = false;
-            y.isNeg = false;
-            r = biSubtract(y, x);
-            // Restore signs, 'cause they're references.
-            x.isNeg = true;
-            y.isNeg = origYIsNeg;
-        } else {
-            q = new BigInt();
-            r = biCopy(x);
-        }
-        return new Array(q,r);
-    }
-
-    q = new BigInt();
-    r = x;
-
-    // Normalize Y.
-    var t = Math.ceil(tb / bitsPerDigit) - 1;
-    var lambda = 0;
-    while (y.digits[t] < biHalfRadix) {
-        y = biShiftLeft(y, 1);
-        ++lambda;
-        ++tb;
-        t = Math.ceil(tb / bitsPerDigit) - 1;
-    }
-    // Shift r over to keep the quotient constant. We'll shift the
-    // remainder back at the end.
-    r = biShiftLeft(r, lambda);
-    nb += lambda;
-    // Update the bit count for x.
-    var n = Math.ceil(nb / bitsPerDigit) - 1;
-
-    var b = biMultiplyByRadixPower(y, n - t);
-    while (biCompare(r, b) != -1) {
-        ++q.digits[n - t];
-        r = biSubtract(r, b);
-    }
-    for (var i = n; i > t; --i) {
-        var ri = (i >= r.digits.length) ? 0 : r.digits[i];
-        var ri1 = (i - 1 >= r.digits.length) ? 0 : r.digits[i - 1];
-        var ri2 = (i - 2 >= r.digits.length) ? 0 : r.digits[i - 2];
-        var yt = (t >= y.digits.length) ? 0 : y.digits[t];
-        var yt1 = (t - 1 >= y.digits.length) ? 0 : y.digits[t - 1];
-        if (ri == yt) {
-            q.digits[i - t - 1] = maxDigitVal;
-        } else {
-            q.digits[i - t - 1] = Math.floor((ri * biRadix + ri1) / yt);
-        }
-
-        var c1 = q.digits[i - t - 1] * ((yt * biRadix) + yt1);
-        var c2 = (ri * biRadixSquared) + ((ri1 * biRadix) + ri2);
-        while (c1 > c2) {
-            --q.digits[i - t - 1];
-            c1 = q.digits[i - t - 1] * ((yt * biRadix) | yt1);
-            c2 = (ri * biRadix * biRadix) + ((ri1 * biRadix) + ri2);
-        }
-
-        b = biMultiplyByRadixPower(y, i - t - 1);
-        r = biSubtract(r, biMultiplyDigit(b, q.digits[i - t - 1]));
-        if (r.isNeg) {
-            r = biAdd(r, b);
-            --q.digits[i - t - 1];
-        }
-    }
-    r = biShiftRight(r, lambda);
-    // Fiddle with the signs and stuff to make sure that 0 <= r < y.
-    q.isNeg = x.isNeg != origYIsNeg;
-    if (x.isNeg) {
-        if (origYIsNeg) {
-            q = biAdd(q, bigOne);
-        } else {
-            q = biSubtract(q, bigOne);
-        }
-        y = biShiftRight(y, lambda);
-        r = biSubtract(y, r);
-    }
-    // Check for the unbelievably stupid degenerate case of r == -0.
-    if (r.digits[0] == 0 && biHighIndex(r) == 0)
-        r.isNeg = false;
-
-    return new Array(q,r);
-}
-
-function biDivide(x, y) {
-    return biDivideModulo(x, y)[0];
-}
-
-function biModulo(x, y) {
-    return biDivideModulo(x, y)[1];
-}
-
-function biMultiplyMod(x, y, m) {
-    return biModulo(biMultiply(x, y), m);
-}
-
-function biPow(x, y) {
-    var result = bigOne;
-    var a = x;
-    while (true) {
-        if ((y & 1) != 0)
-            result = biMultiply(result, a);
-        y >>= 1;
-        if (y == 0)
-            break;
-        a = biMultiply(a, a);
-    }
-    return result;
-}
-
-function biPowMod(x, y, m) {
-    var result = bigOne;
-    var a = x;
-    var k = y;
-    while (true) {
-        if ((k.digits[0] & 1) != 0)
-            result = biMultiplyMod(result, a, m);
-        k = biShiftRight(k, 1);
-        if (k.digits[0] == 0 && biHighIndex(k) == 0)
-            break;
-        a = biMultiplyMod(a, a, m);
-    }
-    return result;
-}
-
-module.exports.biFromHex = biFromHex
-module.exports.bigInt = BigInt
-module.exports.biHighIndex = biHighIndex
-module.exports.biCopy = biCopy
-module.exports.biHighIndex = biHighIndex
-module.exports.biDivide = biDivide
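
BigInt.js represents every value as a little-endian array of 16-bit "digits" (digits[0] is the least significant), which is what the setMaxDigits/ZERO_ARRAY machinery above is sizing. A short Python sketch of how that layout maps onto an ordinary integer (illustrative helpers, non-negative values only):

    def digits_to_int(digits):
        """Little-endian base-2**16 digit list -> Python int."""
        return sum(d << (16 * i) for i, d in enumerate(digits))

    def int_to_digits(n):
        """Non-negative Python int -> little-endian base-2**16 digits (what biFromNumber builds)."""
        out = []
        while n > 0:
            out.append(n & 0xFFFF)
            n >>= 16
        return out or [0]

    # biFromHex("12345678") fills digits as [0x5678, 0x1234]; the round trip holds:
    assert int_to_digits(0x12345678) == [0x5678, 0x1234]
    assert digits_to_int([0x5678, 0x1234]) == 0x12345678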

+ 0 - 583
NoteWork/cesspider/js/rsa/RSA.js

@@ -1,583 +0,0 @@
-/*
-* Copyright (c) 2015 Eric Wilde.
-* Copyright 1998-2015 David Shapiro.
-* 
-* RSA.js is a suite of routines for performing RSA public-key computations
-* in JavaScript.  The cryptographic functions herein are used for encoding
-* and decoding strings to be sent over unsecure channels.
-*
-* To use these routines, a pair of public/private keys is created through a
-* number of means (OpenSSL tools on Linux/Unix, Dave Shapiro's
-* RSAKeyGenerator program on Windows).  These keys are passed to RSAKeyPair
-* as hexadecimal strings to create an encryption key object.  This key object
-* is then used with encryptedString to encrypt blocks of plaintext using the
-* public key.  The resulting cyphertext blocks can be decrypted with
-* decryptedString.
-*
-* Note that the cryptographic functions herein are complementary to those
-* found in CryptoFuncs.php and CryptoFuncs.pm.  Hence, encrypted messages may
-* be sent between programs written in any of those languages.  The most
-* useful, of course is to send messages encrypted by a Web page using RSA.js
-* to a PHP or Perl script running on a Web servitron.
-*
-* Also, the optional padding flag may be specified on the call to
-* encryptedString, in which case blocks of cyphertext that are compatible
-* with real crypto libraries such as OpenSSL or Microsoft will be created.
-* These blocks of cyphertext can then be sent to Web servitron that uses one
-* of these crypto libraries for decryption.  This allows messages encrypted
-* with longer keys to be decrypted quickly on the Web server as well as
-* making for more secure communications when a padding algorithm such as
-* PKCS1v1.5 is used.
-*
-* These routines require BigInt.js and Barrett.js.
-*/
-
-/*****************************************************************************/
-
-/*
-* Modifications
-* -------------
-*
-* 2014 Jan 11  E. Wilde       Add optional padding flag to encryptedString
-*                             for compatibility with real crypto libraries
-*                             such as OpenSSL or Microsoft.  Add PKCS1v1.5
-*                             padding.
-*
-* 2015 Jan 5  D. Shapiro      Add optional encoding flag for encryptedString
-*                             and encapsulate padding and encoding constants
-*                             in RSAAPP object.
-*
-* Original Code
-* -------------
-*
-* Copyright 1998-2005 David Shapiro.
-*
-* You may use, re-use, abuse, copy, and modify this code to your liking, but
-* please keep this header.
-*
-* Thanks!
-* 
-* Dave Shapiro
-* dave@ohdave.com
-*/
-
-/*****************************************************************************/
-const BigInt = require("./BigInt");
-
-const Barrett = require("./Barrett");
-// const bigInt = require("./BigInt");
-// console.log(bigInt,biFromHex)
-var bigInt = BigInt.bigInt
-var biFromHex = BigInt.biFromHex
-var biHighIndex = BigInt.biHighIndex
-var BarrettMu = Barrett.BarrettMu
-var RSAAPP = {};
-
-RSAAPP.NoPadding = "NoPadding";
-RSAAPP.PKCS1Padding = "PKCS1Padding";
-RSAAPP.RawEncoding = "RawEncoding";
-RSAAPP.NumericEncoding = "NumericEncoding"
-
-/*****************************************************************************/
-
-function RSAKeyPair(encryptionExponent, decryptionExponent, modulus, keylen)
-/*
-* encryptionExponent                    The encryption exponent (i.e. public
-*                                       encryption key) to be used for
-*                                       encrypting messages.  If you aren't
-*                                       doing any encrypting, a dummy
-*                                       exponent such as "10001" can be
-*                                       passed.
-*
-* decryptionExponent                    The decryption exponent (i.e. private
-*                                       decryption key) to be used for
-*                                       decrypting messages.  If you aren't
-*                                       doing any decrypting, a dummy
-*                                       exponent such as "10001" can be
-*                                       passed.
-*
-* modulus                               The modulus to be used both for
-*                                       encrypting and decrypting messages.
-*
-* keylen                                The optional length of the key, in
-*                                       bits.  If omitted, RSAKeyPair will
-*                                       attempt to derive a key length (but,
-*                                       see the notes below).
-*
-* returns                               The "new" object creator returns an
-*                                       instance of a key object that can be
-*                                       used to encrypt/decrypt messages.
-*
-* This routine is invoked as the first step in the encryption or decryption
-* process to take the three numbers (expressed as hexadecimal strings) that
-* are used for RSA asymmetric encryption/decryption and turn them into a key
-* object that can be used for encrypting and decrypting.
-*
-* The key object is created thusly:
-*
-*      RSAKey = new RSAKeyPair("ABC12345", 10001, "987654FE");
-*
-* or:
-*
-*      RSAKey = new RSAKeyPair("ABC12345", 10001, "987654FE", 64);
-*
-* Note that RSAKeyPair will try to derive the length of the key that is being
-* used, from the key itself.  The key length is especially useful when one of
-* the padding options is used and/or when the encrypted messages created by
-* the routine encryptedString are exchanged with a real crypto library such
-* as OpenSSL or Microsoft, as it determines how many padding characters are
-* appended.
-*
-* Usually, RSAKeyPair can determine the key length from the modulus of the
-* key but this doesn't always work properly, depending on the actual value of
-* the modulus.  If you are exchanging messages with a real crypto library,
-* such as OpenSSL or Microsoft, that depends on the fact that the blocks
-* being passed to it are properly padded, you'll want the key length to be
-* set properly.  If that's the case, of if you just want to be sure, you
-* should specify the key length that you used to generated the key, in bits
-* when this routine is invoked.
-*/
-{
-/*
-* Convert from hexadecimal and save the encryption/decryption exponents and
-* modulus as big integers in the key object.
-*/
-this.e = biFromHex(encryptionExponent);
-this.d = biFromHex(decryptionExponent);
-this.m = biFromHex(modulus);
-/*
-* Using big integers, we can represent two bytes per element in the big
-* integer array, so we calculate the chunk size as:
-*
-*      chunkSize = 2 * (number of digits in modulus - 1)
-*
-* Since biHighIndex returns the high index, not the number of digits, the
-* number 1 has already been subtracted from its answer.
-*
-* However, having said all this, "User Knows Best".  If our caller passes us
-* a key length (in bits), we'll treat it as gospel truth.
-*/
-if (typeof(keylen) != 'number') { this.chunkSize = 2 * biHighIndex(this.m); }
-else { this.chunkSize = keylen / 8; }
-
-this.radix = 16;
-/*
-* Precalculate the stuff used for Barrett modular reductions.
-*/
-this.barrett = new BarrettMu(this.m);
-}
-
-/*****************************************************************************/
-
-function encryptedString(key, s, pad, encoding)
-/*
-* key                                   The previously-built RSA key whose
-*                                       public key component is to be used to
-*                                       encrypt the plaintext string.
-*
-* s                                     The plaintext string that is to be
-*                                       encrypted, using the RSA assymmetric
-*                                       encryption method.
-*
-* pad                                   The optional padding method to use
-*                                       when extending the plaintext to the
-*                                       full chunk size required by the RSA
-*                                       algorithm.  To maintain compatibility
-*                                       with other crypto libraries, the
-*                                       padding method is described by a
-*                                       string.  The default, if not
-*                                       specified is "OHDave".  Here are the
-*                                       choices:
-*
-*                                         OHDave - this is the original
-*                                           padding method employed by Dave
-*                                           Shapiro and Rob Saunders.  If
-*                                           this method is chosen, the
-*                                           plaintext can be of any length.
-*                                           It will be padded to the correct
-*                                           length with zeros and then broken
-*                                           up into chunks of the correct
-*                                           length before being encrypted.
-*                                           The resultant cyphertext blocks
-*                                           will be separated by blanks.
-*
-*                                           Note that the original code by
-*                                           Dave Shapiro reverses the byte
-*                                           order to little-endian, as the
-*                                           plaintext is encrypted.  If
-*                                           either these JavaScript routines
-*                                           or one of the complementary
-*                                           PHP/Perl routines derived from
-*                                           this code is used for decryption,
-*                                           the byte order will be reversed
-*                                           again upon decryption so as to
-*                                           come out correctly.
-*                                           
-*                                           Also note that this padding
-*                                           method is claimed to be less
-*                                           secure than PKCS1Padding.
-*
-*                                         NoPadding - this method truncates
-*                                           the plaintext to the length of
-*                                           the RSA key, if it is longer.  If
-*                                           its length is shorter, it is
-*                                           padded with zeros.  In either
-*                                           case, the plaintext string is
-*                                           reversed to preserve big-endian
-*                                           order before it is encrypted to
-*                                           maintain compatibility with real
-*                                           crypto libraries such as OpenSSL
-*                                           or Microsoft.  When the
-*                                           cyphertext is to be decrypted
-*                                           by a crypto library, the
-*                                           library routine's RSAAPP.NoPadding
-*                                           flag, or its equivalent, should
-*                                           be used.
-*
-*                                           Note that this padding method is
-*                                           claimed to be less secure than
-*                                           PKCS1Padding.
-*
-*                                         PKCS1Padding - the PKCS1v1.5
-*                                           padding method (as described in
-*                                           RFC 2313) is employed to pad the
-*                                           plaintext string.  The plaintext
-*                                           string must be no longer than the
-*                                           length of the RSA key minus 11,
-*                                           since PKCS1v1.5 requires 3 bytes
-*                                           of overhead and specifies a
-*                                           minimum pad of 8 bytes.  The
-*                                           plaintext string is padded with
-*                                           randomly-generated bytes and then
-*                                           its order is reversed to preserve
-*                                           big-endian order before it is
-*                                           encrypted to maintain
-*                                           compatibility with real crypto
-*                                           libraries such as OpenSSL or
-*                                           Microsoft.  When the cyphertext
-*                                           is to be decrypted by a crypto
-*                                           library, the library routine's
-*                                           RSAAPP.PKCS1Padding flag, or its
-*                                           equivalent, should be used.
-*
-* encoding                              The optional encoding scheme to use
-*                                       for the return value. If ommitted,
-*                                       numeric encoding will be used.
-*
-*                                           RawEncoding - The return value
-*                                           is given as its raw value.
-*                                           This is the easiest method when
-*                                           interoperating with server-side
-*                                           OpenSSL, as no additional conversion
-*                                           is required. Use the constant
-*                                           RSAAPP.RawEncoding for this option.
-*
-*                                           NumericEncoding - The return value
-*                                           is given as a number in hexadecimal.
-*                                           Perhaps useful for debugging, but
-*                                           will need to be translated back to
-*                                           its raw equivalent (e.g. using
-*                                           PHP's hex2bin) before using with
-*                                           OpenSSL. Use the constant
-*                                           RSAAPP.NumericEncoding for this option.
-*
-* returns                               The cyphertext block that results
-*                                       from encrypting the plaintext string
-*                                       s with the RSA key.
-*
-* This routine accepts a plaintext string that is to be encrypted with the
-* public key component of the previously-built RSA key using the RSA
-* assymmetric encryption method.  Before it is encrypted, the plaintext
-* string is padded to the same length as the encryption key for proper
-* encryption.
-*
-* Depending on the padding method chosen, an optional header with block type
-* is prepended, the plaintext is padded using zeros or randomly-generated
-* bytes, and then the plaintext is possibly broken up into chunks.
-*
-* Note that, for padding with zeros, this routine was altered by Rob Saunders
-* (rob@robsaunders.net). The new routine pads the string after it has been
-* converted to an array. This fixes an incompatibility with Flash MX's
-* ActionScript.
-*
-* The various padding schemes employed by this routine, and as presented to
-* RSA for encryption, are shown below.  Note that the RSA encryption done
-* herein reverses the byte order as encryption is done:
-*
-*      Plaintext In
-*      ------------
-*
-*      d5 d4 d3 d2 d1 d0
-*
-*      OHDave
-*      ------
-*
-*      d5 d4 d3 d2 d1 d0 00 00 00 /.../ 00 00 00 00 00 00 00 00
-*
-*      NoPadding
-*      ---------
-*
-*      00 00 00 00 00 00 00 00 00 /.../ 00 00 d0 d1 d2 d3 d4 d5
-*
-*      PKCS1Padding
-*      ------------
-*
-*      d0 d1 d2 d3 d4 d5 00 p0 p1 /.../ p2 p3 p4 p5 p6 p7 02 00
-*                            \------------  ------------/
-*                                         \/
-*                             Minimum 8 bytes pad length
-*/
-{
-var a = new Array();                    // The usual Alice and Bob stuff
-var sl = s.length;                      // Plaintext string length
-var i, j, k;                            // The usual Fortran index stuff
-var padtype;                            // Type of padding to do
-var encodingtype;                       // Type of output encoding
-var rpad;                               // Random pad
-var al;                                 // Array length
-var result = "";                        // Cypthertext result
-var block;                              // Big integer block to encrypt
-var crypt;                              // Big integer result
-var text;                               // Text result
-/*
-* Figure out the padding type.
-*/
-if (typeof(pad) == 'string') {
-  if (pad == RSAAPP.NoPadding) { padtype = 1; }
-  else if (pad == RSAAPP.PKCS1Padding) { padtype = 2; }
-  else { padtype = 0; }
-}
-else { padtype = 0; }
-/*
-* Determine encoding type.
-*/
-if (typeof(encoding) == 'string' && encoding == RSAAPP.RawEncoding) {
-	encodingtype = 1;
-}
-else { encodingtype = 0; }
-
-/*
-* If we're not using Dave's padding method, we need to truncate long
-* plaintext blocks to the correct length for the padding method used:
-*
-*       NoPadding    - key length
-*       PKCS1Padding - key length - 11
-*/
-if (padtype == 1) {
-  if (sl > key.chunkSize) { sl = key.chunkSize; }
-}
-else if (padtype == 2) {
-  if (sl > (key.chunkSize-11)) { sl = key.chunkSize - 11; }
-}
-/*
-* Convert the plaintext string to an array of characters so that we can work
-* with individual characters.
-*
-* Note that, if we're talking to a real crypto library at the other end, we
-* reverse the plaintext order to preserve big-endian order.
-*/
-i = 0;
-
-if (padtype == 2) { j = sl - 1; }
-else { j = key.chunkSize - 1; }
-
-while (i < sl) {
-  if (padtype) { a[j] = s.charCodeAt(i); }
-  else { a[i] = s.charCodeAt(i); }
-
-  i++; j--;
-}
-/*
-* Now is the time to add the padding.
-*
-* If we're doing PKCS1v1.5 padding, we pick up padding where we left off and
-* pad the remainder of the block.  Otherwise, we pad at the front of the
-* block.  This gives us the correct padding for big-endian blocks.
-*
-* The padding is either a zero byte or a randomly-generated non-zero byte.
-*/
-if (padtype == 1) { i = 0; }
-
-j = key.chunkSize - (sl % key.chunkSize);
-
-while (j > 0) {
-  if (padtype == 2) {
-    rpad = Math.floor(Math.random() * 256);
-
-    while (!rpad) { rpad = Math.floor(Math.random() * 256); }
-
-    a[i] = rpad;
-  }
-  else { a[i] = 0; }
-
-  i++; j--;
-}
-/*
-* For PKCS1v1.5 padding, we need to fill in the block header.
-*
-* According to RFC 2313, a block type, a padding string, and the data shall
-* be formatted into the encryption block:
-*
-*      EncrBlock = 00 || BlockType || PadString || 00 || Data
-*
-* The block type shall be a single octet indicating the structure of the
-* encryption block. For this version of the document it shall have value 00,
-* 01, or 02. For a private-key operation, the block type shall be 00 or 01.
-* For a public-key operation, it shall be 02.
-*
-* The padding string shall consist of enough octets to pad the encryption
-* block to the length of the encryption key.  For block type 00, the octets
-* shall have value 00; for block type 01, they shall have value FF; and for
-* block type 02, they shall be pseudorandomly generated and nonzero.
-*
-* Note that in a previous step, we wrote padding bytes into the first three
-* bytes of the encryption block because it was simpler to do so.  We now
-* overwrite them.
-*/
-if (padtype == 2)
-  {
-  a[sl] = 0;
-  a[key.chunkSize-2] = 2;
-  a[key.chunkSize-1] = 0;
-  }
-/*
-* Carve up the plaintext and encrypt each of the resultant blocks.
-*/
-al = a.length;
-
-for (i = 0; i < al; i += key.chunkSize) {
-  /*
-  * Get a block.
-  */
-  block = new bigInt();
-
-  j = 0;
-
-  for (k = i; k < (i+key.chunkSize); ++j) {
-    block.digits[j] = a[k++];
-    block.digits[j] += a[k++] << 8;
-  }
-  /*
-  * Encrypt it, convert it to text, and append it to the result.
-  */
-  crypt = key.barrett.powMod(block, key.e);
-  if (encodingtype == 1) {
-	  text = biToBytes(crypt);
-  }
-  else {
-	  text = (key.radix == 16) ? biToHex(crypt) : biToString(crypt, key.radix);
-  }
-  result += text;
-}
-/*
-* Return the result, removing the last space.
-*/
-//result = (result.substring(0, result.length - 1));
-return result;
-}
-
-/*****************************************************************************/
-
-function decryptedString(key, c)
-/*
-* key                                   The previously-built RSA key whose
-*                                       private key component is to be used
-*                                       to decrypt the cyphertext string.
-*
-* c                                     The cyphertext string that is to be
-*                                       decrypted, using the RSA assymmetric
-*                                       encryption method.
-*
-* returns                               The plaintext block that results from
-*                                       decrypting the cyphertext string c
-*                                       with the RSA key.
-*
-* This routine is the complementary decryption routine that is meant to be
-* used for JavaScript decryption of cyphertext blocks that were encrypted
-* using the OHDave padding method of the encryptedString routine (in this
-* module).  It can also decrypt cyphertext blocks that were encrypted by
-* RSAEncode (in CryptoFuncs.pm or CryptoFuncs.php) so that encrypted
-* messages can be sent of insecure links (e.g. HTTP) to a Web page.
-*
-* It accepts a cyphertext string that is to be decrypted with the public key
-* component of the previously-built RSA key using the RSA assymmetric
-* encryption method.  Multiple cyphertext blocks are broken apart, if they
-* are found in c, and each block is decrypted.  All of the decrypted blocks
-* are concatenated back together to obtain the original plaintext string.
-*
-* This routine assumes that the plaintext was padded to the same length as
-* the encryption key with zeros.  Therefore, it removes any zero bytes that
-* are found at the end of the last decrypted block, before it is appended to
-* the decrypted plaintext string.
-*
-* Note that the encryptedString routine (in this module) works fairly quickly
-* simply by virtue of the fact that the public key most often chosen is quite
-* short (e.g. 0x10001).  This routine does not have that luxury.  The
-* decryption key that it must employ is the full key length.  For long keys,
-* this can result in serious timing delays (e.g. 7-8 seconds to decrypt using
-* 2048 bit keys on a reasonably fast machine, under the Firefox Web browser).
-*
-* If you intend to send encrypted messagess to a JavaScript program running
-* under a Web browser, you might consider using shorter keys to keep the
-* decryption times low.  Alternately, a better scheme is to generate a random
-* key for use by a symmetric encryption algorithm and transmit it to the
-* other end, after encrypting it with encryptedString.  The other end can use
-* a real crypto library (e.g. OpenSSL or Microsoft) to decrypt the key and
-* then use it to encrypt all of the messages (with a symmetric encryption
-* algorithm such as Twofish or AES) bound for the JavaScript program.
-* Symmetric decryption is orders of magnitude faster than asymmetric and
-* should yield low decryption times, even when executed in JavaScript.
-*
-* Also note that only the OHDave padding method (e.g. zeros) is supported by
-* this routine *AND* that this routine expects little-endian cyphertext, as
-* created by the encryptedString routine (in this module) or the RSAEncode
-* routine (in either CryptoFuncs.pm or CryptoFuncs.php).  You can use one of
-* the real crypto libraries to create cyphertext that can be decrypted by
-* this routine, if you reverse the plaintext byte order first and then
-* manually pad it with zero bytes.  The plaintext should then be encrypted
-* with the NoPadding flag or its equivalent in the crypto library of your
-* choice.
-*/
-{
-var blocks = c.split(" ");              // Multiple blocks of cyphertext
-var b;                                  // The usual Alice and Bob stuff
-var i, j;                               // The usual Fortran index stuff
-var bi;                                 // Cyphertext as a big integer
-var result = "";                        // Plaintext result
-/*
-* Carve up the cyphertext into blocks.
-*/
-for (i = 0; i < blocks.length; ++i) {
-  /*
-  * Depending on the radix being used for the key, convert this cyphertext
-  * block into a big integer.
-  */
-  if (key.radix == 16) { bi = biFromHex(blocks[i]); }
-  else { bi = biFromString(blocks[i], key.radix); }
-  /*
-  * Decrypt the cyphertext.
-  */
-  b = key.barrett.powMod(bi, key.d);
-  /*
-  * Convert the decrypted big integer back to the plaintext string.  Since
-  * we are using big integers, each element thereof represents two bytes of
-  * plaintext.
-  */
-  for (j = 0; j <= biHighIndex(b); ++j) {
-    result += String.fromCharCode(b.digits[j] & 255, b.digits[j] >> 8);
-  }
-}
-/*
-* Remove trailing null, if any.
-*/
-if (result.charCodeAt(result.length - 1) == 0) {
-  result = result.substring(0, result.length - 1);
-}
-/*
-* Return the plaintext.
-*/
-return (result);
-}
-
-// export {RSAKeyPair}
-module.exports = RSAKeyPair;
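
With the default "OHDave" scheme, encryptedString packs each chunk two bytes per 16-bit digit, least-significant byte first, so a single chunk boils down to modular exponentiation over a little-endian integer. A rough Python equivalent for one chunk, as a sketch under those assumptions (single-byte characters only, placeholder key material, and biToHex additionally left-pads the hex result to whole 4-character digit groups):

    def ohdave_encrypt_chunk(plaintext, e_hex, n_hex):
        """Approximate a one-chunk encryptedString(key, s) call with the default padding."""
        e, n = int(e_hex, 16), int(n_hex, 16)
        chunk_size = 2 * ((n.bit_length() + 15) // 16 - 1)    # 2 * biHighIndex(modulus)
        data = plaintext.encode("latin-1")[:chunk_size].ljust(chunk_size, b"\x00")
        block = int.from_bytes(data, "little")                # digits[j] = b[2j] + (b[2j+1] << 8)
        return format(pow(block, e, n), "x")

    # e.g. ohdave_encrypt_chunk("UsVjaS7Wj4", "10001", "<modulus hex>")  # placeholder values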

+ 0 - 0
NoteWork/cesspider/js/rsa/__init__.py


+ 0 - 109
NoteWork/cesspider/magpces.py

@@ -1,109 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-09-06 14:21
----------
-@summary: utility helpers
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import datetime
-import json
-import re
-from pprint import pformat
-print('sssssssss')
-_regexs = {}
-def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
-    regexs = isinstance(regexs, str) and [regexs] or regexs
-
-    infos = []
-    for regex in regexs:
-        if regex == "":
-            continue
-
-        if regex not in _regexs.keys():
-            _regexs[regex] = re.compile(regex, re.S)
-
-        if fetch_one:
-            infos = _regexs[regex].search(html)
-            if infos:
-                infos = infos.groups()
-            else:
-                continue
-        else:
-            infos = _regexs[regex].findall(str(html))
-
-        if len(infos) > 0:
-            # print(regex)
-            break
-
-    if fetch_one:
-        infos = infos if infos else ("",)
-        return infos if len(infos) > 1 else infos[0]
-    else:
-        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
-        infos = split.join(infos) if split else infos
-        return infos
-def get_json(json_str):
-    """
-    @summary: parse a json object
-    ---------
-    @param json_str: string in json format
-    ---------
-    @result: the parsed json object
-    """
-
-    try:
-        return json.loads(json_str) if json_str else {}
-    except Exception as e1:
-        try:
-            json_str = json_str.strip()
-            json_str = json_str.replace("'", '"')
-            keys = get_info(json_str, "(\w+):")
-            for key in keys:
-                json_str = json_str.replace(key, '"%s"' % key)
-
-            return json.loads(json_str) if json_str else {}
-
-        except Exception as e2:
-            print(
-                """
-                e1: %s
-                format json_str: %s
-                e2: %s
-                """
-                % (e1, json_str, e2)
-            )
-
-        return {}
-
-
-def dumps_json(json_, indent=4, sort_keys=False):
-    """
-    @summary: format json for printing
-    ---------
-    @param json_: json string or json object
-    ---------
-    @result: the formatted string
-    """
-    try:
-        if isinstance(json_, str):
-            json_ = get_json(json_)
-
-        json_ = json.dumps(
-            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
-        )
-
-    except Exception as e:
-        print(e)
-        json_ = pformat(json_)
-
-    return json_
-def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.now().strftime(date_format)
-    # return time.strftime(date_format, time.localtime(time.time()))
-def key2hump(key):
-    """
-    convert snake_case keys to CamelCase
-    """
-    return key.title().replace("_", "")
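
For reference, what the two most commonly used helpers above do in practice (a usage sketch; the import assumes the module above is importable as magpces):

    from magpces import get_json, key2hump   # hypothetical import of the module above

    # get_json tries json.loads first, then swaps quotes / quotes bare keys and retries.
    print(get_json("{'pageNo': 1, 'pageSize': 20}"))   # -> {'pageNo': 1, 'pageSize': 20}

    # key2hump turns snake_case into CamelCase.
    print(key2hump("notice_title"))                    # -> NoticeTitle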

+ 0 - 95
NoteWork/cesspider/中国南方电网电子采购交易平台.py

@@ -1,95 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 11:13:05
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import timestamp_to_date
-
-class Zgnfdzcgjypt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             # Menu('Zgnfdzcgjypt', 'Zgnfdzcgjypt', "Notice", 1),
-             Menu('招标采购公告', 'a_zgnfdwdzcgjypt_zbcggg', "Notice", 3),
-         ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = "https://ecsg.com.cn/api/tender/tendermanage/gatewayNoticeQueryController/queryGatewayNoticeListPagination"
-            for page in range(menu.crawl_page):
-                data = {
-                    "noticeTitle": "",
-                    "publishTime": "",
-                    "organizationInfoName": "",
-                    "pageNo": page + 1,
-                    "pageSize": 20
-                }
-                data = json.dumps(data)
-                yield feapder.Request(url=start_url, item=menu._asdict(), method='POST', data=data)
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        self.count += 1   # simple counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("list")
-        for info in info_list:
-            # href = ''
-            title = info.get("noticeTitle")
-            create_time = info.get("publishTime")
-            create_time = timestamp_to_date(int(create_time/1000), time_format="%Y-%m-%d %H:%M:%S")
-            href = f'https://ecsg.com.cn/cms/NoticeDetail.html?objectId={info.get("objectId")}&objectType={info.get("objectType")}&typeid=4'
-            data = {"objectId": info.get('objectId'), "objectType": "1"}
-            data = json.dumps(data)
-            data_item = DataBakItem()  # item that carries one announcement
-            data_item.href = href  # announcement detail link
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (set by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (set by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # announcement publish time
-            data_item.site = "中国南方电网电子采购交易平台"
-            data_item.area = "全国"  # area, defaults to "全国" (nationwide)
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.request_params = {
-                "data":data,"method":"POST"}
-            list_item.deal_detail = '''
-html = response.json.get("noticeContent")
-            '''
-            list_item.author = "马国鹏"
-            list_item.parse_url = "https://ecsg.com.cn/api/tender/tendermanage/gatewayNoticeQueryController/getNotice"
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-            "Content-Type": "application/json;charset=UTF-8",
-            "Referer": "https://ecsg.com.cn/cms/NoticeList.html?id=159&typeid=4&word=&seacrhDate=",
-            "Accept-Language": "zh-CN,zh;q=0.9"
-        }
-
-if __name__ == "__main__":
-    Zgnfdzcgjypt(redis_key="fwork:Zgnfdzcgjypt").start()

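For reference, the list-page dedup pattern shared by the spider above (and by the other deleted spiders below) reduces to: filter out hrefs that are already recorded, yield detail tasks only for the new ones, then record them in one batch. A minimal sketch, assuming feapder's Dedup behaves as used in these files (filter_exist_data returns only the unseen values):

from feapder.dedup import Dedup

def split_new_hrefs(hrefs):
    """Return (dedup, new_hrefs); the caller yields detail tasks for new_hrefs, then calls dedup.add(new_hrefs)."""
    dedup = Dedup(Dedup.BloomFilter)
    new_hrefs = [href for href in hrefs if dedup.filter_exist_data([href]) != []]
    return dedup, new_hrefs

# Usage sketch (hypothetical URL):
# dedup, todo = split_new_hrefs(["https://example.com/notice/1"])
# ... yield one MgpListItem per href in todo ...
# dedup.add(todo)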
+ 0 - 133
NoteWork/cesspider/中国鲁班商务委.py

@@ -1,133 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-20 13:49:04
----------
-@summary: Zglbsww
----------
-@author: dist
-"""
-import json
-import sys
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zglbsww(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'purchaseType',"orders", 'crawl_page'])
-         self.site= "中铁鲁班商务网"
-
-         self.menus = [
-             Menu('公告补遗-招标采购', 'a_ztlbsww_zhbgg', "CRFQ","publish_time", 1),
-             Menu('公告补遗-询价采购', 'a_ztlbsww_ggby_xjcg', "XJFQ","publish_time", 1),
-             Menu('公告补遗-竞争性谈判', 'a_ztlbsww_cqby', "TPFQ","publish_time", 1),
-             Menu('公告补遗-竞价采购', 'a_ztlbsww_ggby_jjcg', "JJFQ","publish_time", 1),
-
-             Menu('采购公告-招标采购', 'a_ztlbsww_zbgg', "CRFQ","pub_time", 1),
-             Menu('采购公告-询价采购', 'a_ztlbsww_lsxjcg', "XJFQ","pub_time", 1),
-             Menu('采购公告-竞争性谈判', 'a_ztlbsww_jzxtp', "TPFQ","pub_time", 1),
-             Menu('采购公告-竞价采购', 'a_ztlbsww_jjcg', "JJFQ","pub_time", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 '''
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 '''
-                 start_url = f'https://eproport.crecgec.com/epu-portal/portal/project/listWithPage'
-                 data = {
-                     "timeType": "month",
-                     "areaCode": "-1",
-                     "mainType": "-1",
-                     "purchaser": None,
-                     "information": None,
-                     "sTime": "",
-                     "eTime": "",
-                     "classify": "-1",
-                     "region": "-1",
-                     "level": "",
-                     "selectedState": "",
-                     "purchaseType": menu.purchaseType,
-                     "noticeType": 1,
-                     "orders": menu.orders,
-                     "dirs": "desc",
-                     "current": page,
-                     "size": 10,
-                     "page": {}
-                 }
-                 data = json.dumps(data)
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,method="POST",data=data)
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            projectid = info.get("projectId")
-            tenantid = info.get("tenantId")
-            href = f'https://eproport.crecgec.com/#/notice/noticexj-detail?projectId={projectid}&tenantId={tenantid}'
-            title = info.get("projectName")
-            create_time = info.get("publishTime") + ":00"
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_ztlbw"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//*']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="****"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-
-            "Content-Type": "application/json"
-        }
-if __name__ == "__main__":
-    Zglbsww(redis_key="dist:Zglbsww").start()

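The files block in the spider above is still a template (its list_xpath is a '****' placeholder). For readers of this diff, a filled-in example of that attachment configuration follows; the xpaths and host are hypothetical illustrations, not values taken from this site:

# Hypothetical attachment-download configuration for MgpListItem.files
files_config = {
    "list_xpath": '//div[@class="attachments"]/a',  # nodes wrapping each attachment link (assumed class name)
    "url_xpath": './@href',                         # attachment URL relative to each node
    "name_xpath": './text()',                       # attachment display name
    "files_type": ('zip', 'docx', 'pdf'),           # extensions worth downloading
    "url_key": 'http',                              # keyword that marks a usable attachment URL
    "host": 'https://example.com',                  # host joined against relative URLs (assumed)
}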
+ 0 - 76
NoteWork/cesspider/交通银行供应商门户.py

@@ -1,76 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 10:30:30
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Jtyhgysmh(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             # Menu('Jtyhgysmh', 'Jtyhgysmh', "Notice", 1),
-             Menu('Jtyhgysmh', 'Jtyhgysmh', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://bocom-gys.bankcomm.com/espuser/register/noticePage'
-            yield feapder.Request(url=start_url, item=menu._asdict(),render=True,render_time=2)
-
-    def parse(self, request, response):
-        print(response.text)
-        import pdb
-        pdb.set_trace()
-        driver = response.browser
-        driver.refresh()
-        # driver.get(start_url)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
-        for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Jtyhgysmh(redis_key="fwork:Jtyhgysmh").start()

+ 0 - 91
NoteWork/cesspider/华创迅采电子采购平台.py

@@ -1,91 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:48:52
----------
-@summary:  华创迅采电子采购平台   详情信息需登录
----------
-@author: topnet
-"""
-import json
-from urllib.parse import urljoin
-
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Hcxcdzcgpt(feapder.Spider):
-
-    def start_callback(self):
-        self.count = 0
-        self.host= 'https://www.bzeps.com/'
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('Hcxcdzcgpt', 'Hcxcdzcgpt', "Notice", 1),
-            # Menu('Hcxcdzcgpt', 'Hcxcdzcgpt', "Notice", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1, menu.crawl_page + 1):
-                start_url = f'https://www.bzeps.com/list/purchase/{page}'
-                data = {
-                    "code": "purchase",
-                    "keyword": "",
-                    "searchType": ""
-                }
-                data = json.dumps(data)
-                yield feapder.Request(url=start_url, item=menu._asdict(), method="POST", data=data)
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1  # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("list")
-        for info in info_list:
-            href = urljoin(self.host, info.get("url"))
-            title = info.get("title")
-            create_time = info.get("pubTime")
-            area = info.get("area")
-            pro = area.split("-")[0]+"省"
-            city = area.split("-")[-1]
-            print(create_time,pro,city,title)
-            print(href)
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "华创迅采电子采购平台"
-            data_item.area = pro  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="gbox"]']
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        request.headers = {
-            "Content-Type": "application/json; charset=UTF-8",
-        }
-        return request
-
-
-if __name__ == "__main__":
-    Hcxcdzcgpt(redis_key="fwork:Hcxcdzcgpt").start()

+ 0 - 70
NoteWork/cesspider/国家税务总局宁波市税务局.py

@@ -1,70 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 11:01:07
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Gjswjzjnbswj(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Gjswjzjnbswj', 'Gjswjzjnbswj', "Notice", 1),
-             Menu('Gjswjzjnbswj', 'Gjswjzjnbswj', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f''
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
-        for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gjswjzjnbswj(redis_key="fwork:Gjswjzjnbswj").start()

+ 0 - 80
NoteWork/cesspider/城轨采购网.py

@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-27 10:54:26
----------
-@summary:
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-def gotoPage(types,fid):  #onclick 的函数,生成url
-    if types == "1" or types == "2":  # 比价公告
-        return "/Purchase/Notice/NewDetail?Id="+fid
-    elif types == "3":  # 在线询价
-        return "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID=" + fid + "&AddNew=0"
-    elif types == "4": # 招标项目
-        return "/Bids/BidsNotice/NewDetail?Id="+fid
-    elif types == "5": #单一来源公示
-        return "/SingleSourceNotice/Notice/NewDetail?Id=" + fid
-
-class Cgcgw(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Cgcgw', 'Cgcgw', "Notice", 1),
-             Menu('Cgcgw', 'Cgcgw', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://www.mtrmart.com/Purchase/Notice/SearchNewList?title=&category=&noticeType=&noticeTypeStr=&NoSinglesource=&companyValue=&isInProgress=n&isOneYear=y&page=2&pageSize=10'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//ul[@class="base-list"]/li')
-        for info in info_list:
-            href = "https://www.mtrmart.com/" + eval(info.xpath('./h6/span/@onclick').extract_first().strip(";"))
-            title = info.xpath('./h6/@title').extract_first()
-            create_time = info.xpath('./p/span[2]/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "城轨采购网"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-        #     yield list_item
-        # dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Cgcgw(redis_key="fwork:Cgcgw").start()

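The spider above re-implements the page's onclick handler as gotoPage() and builds the detail URL by eval()-ing the extracted call, which also mis-joins the absolute URL returned for type "3". A dict-based dispatch plus urljoin avoids eval; this is a sketch using the same type-to-path mapping, not the original implementation:

from urllib.parse import urljoin

HOST = "https://www.mtrmart.com/"
DETAIL_PATHS = {
    "1": "/Purchase/Notice/NewDetail?Id={fid}",            # 比价公告
    "2": "/Purchase/Notice/NewDetail?Id={fid}",
    "3": "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID={fid}&AddNew=0",  # 在线询价
    "4": "/Bids/BidsNotice/NewDetail?Id={fid}",            # 招标项目
    "5": "/SingleSourceNotice/Notice/NewDetail?Id={fid}",  # 单一来源公示
}

def build_detail_url(types, fid):
    """Build a detail-page URL for the given notice type without eval(); absolute URLs pass through urljoin unchanged."""
    return urljoin(HOST, DETAIL_PATHS[types].format(fid=fid))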
+ 0 - 74
NoteWork/cesspider/山西省招标投标协会.py

@@ -1,74 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-27 10:34:50
----------
-@summary:
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Sxsztbxh(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Sxsztbxh', 'Sxsztbxh', "Notice", 1),
-             Menu('Sxsztbxh', 'Sxsztbxh', "Notice", 1),
-         ]
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1,menu.crawl_page+1):
-                start_url = f'http://www.sxtba.com/prod-api/web/prequalificationList?releaseType=&pageNum={page}&pageSize=10'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = info.get("")
-
-            href = f'http://www.sxtba.com/home/zcInfoChildDetail?id={info}&noticeListRoute=NprequalificationNotice&projectTypes=otherBidding'
-            title = info.get("noticeTitle")
-            create_time = info.get("createTime")
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Sxsztbxh(redis_key="fwork:Sxsztbxh").start()

+ 0 - 32
NoteWork/cesspider/广东测试.py

@@ -1,32 +0,0 @@
-import requests
-
-
-headers = {
-
-    # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-url = "https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do"
-params = {
-    "": "",
-    "siteId": "cd64e06a-21a7-4620-aebc-0576bab7e07a",
-    "channel": "fca71be5-fc0c-45db-96af-f513e9abda9d",
-    "currPage": "1",
-    "pageSize": "10",
-    "noticeType": "00103",
-    "regionCode": "440001",
-    "verifyCode": "2158",
-    "subChannel": "false",
-    "purchaseManner": "",
-    "title": "",
-    "openTenderCode": "",
-    "purchaser": "",
-    "agency": "",
-    "purchaseNature": "",
-    "operationStartTime": "",
-    "operationEndTime": "",
-    "selectTimeName": "noticeTime"
-}
-response = requests.get(url, params=params)
-
-print(response.text)
-print(response)

+ 0 - 137
NoteWork/cesspider/广东省政府采购网.py

@@ -1,137 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-18 09:41:49
----------
-@summary: Gdszfcgw
----------
-@author: dist
-"""
-import sys
-from urllib.parse import urljoin
-
-import requests
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder,time
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.get_imgcode import get_code
-#
-# # custom_settings = { 'DOWNLOAD_DELAY': 10, 'CONCURRENT_REQUESTS_PER_IP': 4, 'DOWNLOADER_MIDDLEWARES': {}, }
-# settings = { 'LOG_LEVEL': "INFO" }
-class Gdszfcgw(feapder.Spider):
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'noticetype','notchannel', 'crawl_page'])
-         self.site= "广东省政府采购网"
-         self.host = 'https://gdgpo.czt.gd.gov.cn'
-
-         self.menus = [
-             Menu('采购意向公开', 'gd_gdszfcgwxwz_cgyxgk','59','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('单一来源公示', 'gd_gdszfcgwxwz_cggg_pccgyxgk','001051','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('采购计划', 'gd_gdszfcgwxwz_cgjh', '001101','95ff31f3-a1af-4bc4-b1a2-54c894476193', 1),   #1
-             Menu('采购需求', 'gd_gdszfcgwxwz_cgxq', '001059','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('资格预审公告', 'gd_gdszfcgwxwz_zgysgg', '001052,001053','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #2
-             Menu('采购公告', 'gd_gdszfcgwxwz_cggg', '00101','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('中标成交公告', 'gd_gdszfcgwxwz_zbcjgg', '00102','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('更正公告', 'gd_gdszfcgwxwz_gzgg', '00103','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('终止公告', 'gd_gdszfcgwxwz_zzgg', '001004,001006','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #3
-             Menu('合同公告', 'gd_gdszfcgwxwz_htgg', '001054','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('验收公告', 'gd_gdszfcgwxwz_ysgg', '001009,00105A','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '201022,201023,201111,00107D','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '202022,202023,202111,00107E,001076','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001071','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '204022,204023,204111,204112','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001054', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001009,00105A', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-
-             # Menu('批量采购', 'gd_gdszfcgwxwz_plcg',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/dzmcgg/index.html', 1),
-             # Menu('进口产品清单', 'gd_gdszfcgwxwz_jkcpqd',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/jkcpqd/index.html','','d7284b7e-29e9-4fe4-bad3-b187ec8edbf9' 1),
-         ]
-    def start_requests(self):
-        code = self.get_code()
-        for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do?&siteId=cd64e06a-21a7-4620-aebc-0576bab7e07a&channel={menu.notchannel}&currPage={page}&pageSize=10&noticeType={menu.noticetype}&regionCode=440001&verifyCode={code}&subChannel=false&purchaseManner=&title=&openTenderCode=&purchaser=&agency=&purchaseNature=&operationStartTime=&operationEndTime=&selectTimeName=noticeTime'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-    def get_code(self):
-        img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
-        header = {"Host": "gdgpo.czt.gd.gov.cn",
-                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
-                  "Origin": "https://gdgpo.czt.gd.gov.cn",
-
-                  }
-        res = requests.get(img_url, headers=header)
-        with open('image/guangdong.jpg', 'wb+') as f:
-            f.write(res.content)
-        res = get_code('image/guangdong.jpg')
-        if res.get("msg")=="success":
-            img_code = res.get("r").get("code")
-        else:
-            img_code = None
-        return img_code
-
-
-    def parse(self, request, response):
-        time.sleep(0.3)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data")
-        for info in info_list:
-            href = info.get("pageurl")
-            title = info.get("shorttitle")
-            create_time = info.get("addtimeStr")
-            href = urljoin(self.host, href)
-
-            area = "广东"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="info-article in active"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="info-article in active"]//div/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp','pdf'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gdszfcgw(redis_key="dist:Gdszfcgw").start()

+ 0 - 9
NoteWork/cesspider/测试查询.py

@@ -1,9 +0,0 @@
-from feapder.dedup import Dedup
-url='http://www.ccgp-tianjin.gov.cn/viewer.do?id=299263823&ver=2'
-
-dedup = Dedup(Dedup.BloomFilter)
-ss = dedup.filter_exist_data([url])
-if ss == []:
-    print('不存在,未在此库操作')
-else:
-    print('已去重')

+ 0 - 114
NoteWork/cesspider/滁州市人民政府网.py

@@ -1,114 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-14 20:02:21
----------
-@summary: 滁州市人民政府网
----------
-@author: mgp
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Czsrmzf(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "滁州市人民政府网"
-
-         self.menus = [
-             Menu('政府信息公开目录-公立医疗机构药品医用设备采购', 'ah_czsrmzfw_gcztb_zbgg', "自定义参数", 1),
-             Menu('重大建设项目-招标投标信息', 'ah_czsrmzfw_zfcg_cggg', "自定义参数", 1),
-             Menu('政府采购', 'ah_czsrmzfw_gcztb_zbgs', "Notice", 1),
-             Menu('工程建设招投标', 'ah_czsrmzfw_zfcg_zbcjgg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.chuzhou.gov.cn/chuzhou/site/label/8888'
-                 params = {
-                        "IsAjax": "1",
-                        "dataType": "html",
-                        "_": "0.5840033326645138",
-                        "labelName": "publicInfoList",
-                        "siteId": "2653861",
-                        "pageSize": "20",
-                        "pageIndex": "3",
-                        "action": "list",
-                        "isDate": "true",
-                        "dateFormat": "yyyy-MM-dd",
-                        "length": "50",
-                        "organId": "2681509",
-                        "type": "4",
-                        "catId": "161735369",
-                        "cId": "",
-                        "result": "暂无相关信息",
-                        "title": "",
-                        "fileNum": "",
-                        "keyWords": "",
-                        "file": "/c1/chuzhou/publicInfoList_newest"
-                    }
-                 yield feapder.Request(url=start_url, params=params, item=menu._asdict(), proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul")
-        for info in info_list:
-            href = info.xpath("./li/a/@href").extract_first().strip()
-            title = info.xpath("./li/a/@title").extract_first().strip()
-            create_time = info.xpath("./li/span/text()").extract_first().strip()
-            area = "安徽"  # 省份
-            city = "滁州市"  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            # list_item.parser_name = "detail_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentbox minh500"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//a[contains(@data-file-ext,"D")]',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host": 'https://www.chuzhou.gov.cn'
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Czsrmzf(redis_key="magp:Czsrmzf").start()

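Note that start_requests above loops over crawl_page but never uses page: pageIndex stays hard-coded at "3". If real pagination was intended, the parameter would have to follow the loop variable; a minimal sketch of that wiring (same endpoint and field names as above, trimmed to the parameters that matter):

import feapder  # assumes the project's feapder environment

def build_page_requests(menu, crawl_page):
    """Yield one list-page request per page, with pageIndex tracking the loop variable."""
    start_url = "https://www.chuzhou.gov.cn/chuzhou/site/label/8888"
    for page in range(1, crawl_page + 1):
        params = {
            "IsAjax": "1",
            "dataType": "html",
            "labelName": "publicInfoList",
            "siteId": "2653861",
            "pageSize": "20",
            "pageIndex": str(page),  # follow the loop variable instead of the fixed "3"
            "action": "list",
        }
        yield feapder.Request(url=start_url, params=params, item=menu, proxies=False)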
+ 0 - 197
NoteWork/cesspider/甘肃政府采购网.py

@@ -1,197 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-        self.count = 0
-        self.prox_pool = ProxyPool()
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/{crawl_page}/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/{crawl_page}/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/{crawl_page}/index.htm?contractsInfo.id=d0", 13),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/{crawl_page}/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 10),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            for page in range(menu.crawl_page):
-                url = menu.url.format(crawl_page=page*10)
-                print(url)
-                yield feapder.Request(url=url, item=menu._asdict(), render=True, callback=eval(menu.parse),render_time=2)
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()

+ 0 - 213
NoteWork/cesspider/甘肃政府采购网_ces.py

@@ -1,213 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from feapder.network.cookie_pool import PageCookiePool
-
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    cookie_pool = PageCookiePool(redis_key='fwork:gszfcg',
-                                 page_url='http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm',driver_type='FIREFOX',executable_path="D:\\geckodriver.exe")
-    def start_callback(self):
-        self.count = 0
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/0/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0", 1),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/0/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 1),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            yield feapder.Request(url=menu.url, item=menu._asdict(),callback=eval(menu.parse))
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        # self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        print('newsList_count',len(response.xpath("//ul[@class='newsList']/li")))
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-            print(create_time,title)
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.headers = {
-            "Connection": "keep-alive",
-            "Cache-Control": "max-age=0",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "Referer": "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm",
-            "Accept-Language": "zh-CN,zh;q=0.9"
-        }
-
-        request.cookies = self.cookie_pool.get_cookie()
-        return request
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()

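The _ces variant above replaces selenium rendering with a PageCookiePool and injects the pooled cookie in download_midware. The core of that pattern, using the same constructor arguments as the deleted file:

from feapder.network.cookie_pool import PageCookiePool

# A browser session against the list page produces cookies that later plain requests reuse.
cookie_pool = PageCookiePool(
    redis_key="fwork:gszfcg",
    page_url="http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm",
    driver_type="FIREFOX",
    executable_path="D:\\geckodriver.exe",
)

def attach_cookie(request):
    """Minimal download_midware: attach a pooled cookie to every outgoing request."""
    request.cookies = cookie_pool.get_cookie()
    return request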
+ 0 - 194
NoteWork/cesspider/甘肃政府采购网_new.py

@@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-        self.count = 0
-        self.prox_pool = ProxyPool()
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/0/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0", 1),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/0/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 1),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            yield feapder.Request(url=menu.url, item=menu._asdict(), render=True, callback=eval(menu.parse),render_time=2)
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        # self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # item that carries the scraped data downstream
-            item_data.href = href  # notice detail URL
-            item_data.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            item_data.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            item_data.title = title  # notice title
-            item_data.publishtime = create_time  # notice publish time
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # region; defaults to 全国 (nationwide) when unknown
-            item_data.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # item that carries the scraped data downstream
-            item_data.href = href  # notice detail URL
-            item_data.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            item_data.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            item_data.title = title  # notice title
-            item_data.publishtime = create_time  # notice publish time
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # region; defaults to 全国 (nationwide) when unknown
-            item_data.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()
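
Every list parser above repeats the same dedup pattern: each href is checked with filter_exist_data before a detail task is yielded, and the batch of new hrefs is recorded with add once the page has been walked. A minimal sketch of that pattern, assuming feapder's Dedup behaves as the deleted code uses it (filter_exist_data returning only values not yet in the filter); the helper name yield_new_hrefs is illustrative, not part of the original.

    from feapder.dedup import Dedup

    def yield_new_hrefs(hrefs):
        """Yield only the hrefs not seen before, then record them in the filter."""
        dedup = Dedup(Dedup.BloomFilter)                  # Bloom-filter backed dedup, as in the spiders above
        new_hrefs = dedup.filter_exist_data(list(hrefs))  # assumption: returns only values NOT already in the filter
        for href in new_hrefs:
            yield href
        dedup.add(new_hrefs)                              # runs once the caller has consumed the generator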

+ 0 - 106
NoteWork/cesspider/福建省政府采购网.py

@@ -1,106 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-06 16:37:37
----------
-@summary: 福建省政府采购网.py
----------
-@author: FworkSpider
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-import random
-import requests
-from untils.chaojiying import Chaojiying_Client
-
-class Fjszfcgw(feapder.Spider):
-    str = '天仙丛付印五仔六五乐四甩瓜九七一失令斤册禾十仗丘非田白付乐仪八代匆乎二们句生四用'
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('项目公告-全部', 'fj_fjszfcgw_xmgg_qb', "自定义参数", 10),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             # for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={1}&verifycode={"".join(random.sample(self.str,4))}'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),page=1,render=True,render_time=2)
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//tbody/tr")
-        if info_list == []:
-            img_url = 'http://www.ccgp-fujian.gov.cn/noticeverifycode/?1'
-            print('出现验证码')
-            img_res = requests.get(img_url)
-            with open('a.jpg', 'wb+') as f:
-                f.write(img_res.content)
-            # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')  # generate a software ID in the user center and substitute it here (e.g. 96001)
-            # im = open('a.jpg', 'rb').read()  # local image file path replacing a.jpg; on Windows the path sometimes needs //
-            # print(chaojiying.PostPic(im, 1902))
-            # res = chaojiying.PostPic(im, 2004)
-            # print(res)
-            # if res.get("err_no") != 0:
-            #     chaojiying.ReportError(res.get("pic_id"))
-            # code = res.get("pic_str")
-            url = request.url[:-4]+"".join(random.sample(self.str,4))
-            yield feapder.Request(url=url, item=menu,random_user_agent=False,page=request.page,render=True,render_time=2)
-            return
-        for info in info_list:
-            href = info.xpath('./td/a/@href').extract_first()
-            title = info.xpath('./td/a/text()').extract_first()
-            create_time = info.xpath('./td[5]/text()').extract_first()
-
-            data_item = DataBakItem()  # item that carries the scraped data downstream
-            data_item.href = href  # notice detail URL
-            data_item.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            data_item.title = title  # notice title
-            data_item.publishtime = create_time  # notice publish time
-            data_item.site = "福建省政府采购网"
-            data_item.area = "福建"  # region; defaults to 全国 (nationwide) when unknown
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="notice-con"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="notice-foot"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'),
-                "file_type":'zip',
-                "url_key":'attach',
-            }
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-        page_url =  f'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={request.page+1}&verifycode={"".join(random.sample(self.str,4))}'
-        if request.page < menu.get("crawl_page"):
-            yield feapder.Request(url=page_url, use_session=True, item=menu, proxies=False,
-                                  random_user_agent=False, page=request.page+1)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers={
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-        }
-
-if __name__ == "__main__":
-    Fjszfcgw(redis_key="FworkSpider:Fjszfcgw2").start()
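
The spider above works around the site's verifycode by resampling four characters from a fixed pool and rebuilding the list URL whenever the result table comes back empty, rather than solving the captcha image (the Chaojiying path stays commented out). A small sketch of that URL builder, grounded in the deleted code; list_url is an illustrative helper name.

    import random

    # character pool the site accepts for the verifycode parameter (copied from the spider above)
    CODE_POOL = '天仙丛付印五仔六五乐四甩瓜九七一失令斤册禾十仗丘非田白付乐仪八代匆乎二们句生四用'

    def list_url(page: int) -> str:
        """Build the notice-list URL with a freshly sampled verifycode."""
        verifycode = "".join(random.sample(CODE_POOL, 4))
        return ("http://www.ccgp-fujian.gov.cn/3500/noticelist/"
                f"e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode={verifycode}")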

+ 0 - 24
NoteWork/cesspider/黔云招采电子招标采购交易平台

@@ -1,24 +0,0 @@
-Qyzcdzzbcgjypt|2022-01-10 17:58:08,690|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Thread-5|2022-01-10 17:58:17,753|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Qyzcdzzbcgjypt.parse error -------------
-                            error          XPath error: Invalid expression in //*[
-                            response       <Response [500]>
-                            deal request   <Request https://www.e-qyzc.com/gg/toXinXiList>
-                            
-Thread-5|2022-01-10 17:58:17,773|parser_control.py|deal_requests|line:349|INFO| 
-                                    入库 等待重试
-                                    url     https://www.e-qyzc.com/gg/toXinXiList
-                                    重试次数 1
-                                    最大允许重试次数 2
-Thread-5|2022-01-10 17:58:20,708|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Qyzcdzzbcgjypt.parse error -------------
-                            error          XPath error: Invalid expression in //*[
-                            response       <Response [500]>
-                            deal request   <Request https://www.e-qyzc.com/gg/toXinXiList>
-                            
-Thread-5|2022-01-10 17:58:20,709|parser_control.py|deal_requests|line:349|INFO| 
-                                    入库 等待重试
-                                    url     https://www.e-qyzc.com/gg/toXinXiList
-                                    重试次数 2
-                                    最大允许重试次数 2
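
The log above shows the spider dying on a half-finished XPath ('//*[') that only surfaced at parse time, after burning both retries. One way to fail fast is to compile expressions up front with lxml (feapder's selectors are lxml-based); compile_xpath below is an illustrative helper, not part of the original code.

    from lxml import etree

    def compile_xpath(expr: str) -> etree.XPath:
        """Compile an XPath eagerly so a typo fails immediately instead of after N retries."""
        try:
            return etree.XPath(expr)
        except etree.XPathSyntaxError as exc:
            raise ValueError(f"invalid xpath {expr!r}: {exc}") from exc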

+ 0 - 93
NoteWork/cesspider/黔云招采电子招标采购交易平台.py

@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-10 09:47:56
----------
-@summary:	黔云招采电子招标采购交易平台
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Qyzcdzzbcgjypt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         self.site='黔云招采电子招标采购交易平台'
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('询价采购-采购公告', 'gz_qyzcdzzbcgjypt_xjcg_cggg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.e-qyzc.com/gg/toXinXiList?gongGaoType=5'
-                 data = {
-                    "currentPage": str(page),
-                    "xmBH": "",
-                    "ggName": "",
-                    "hangYeType": "",
-                    "zbrName": "",
-                    "zbdlName": ""
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False, data=data,method="POST")
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # simple page counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//table[@id="p1"]/tr[position()>1]')
-        for info in info_list:
-            href = info.xpath('./td/a/@href').extract_first().strip()
-            title = info.xpath('./td/a/text()').extract_first().strip()
-            create_time = info.xpath('./td[5]/text()').extract_first().strip()
-
-            data_item = DataBakItem()  # item that carries the scraped data downstream
-            data_item.href = href  # notice detail URL
-            data_item.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            data_item.title = title  # notice title
-            data_item.publishtime = create_time  # notice publish time
-            data_item.site = self.site
-            data_item.area = "贵州省"  # region; defaults to 全国 (nationwide) when unknown
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="page_contect bai_bg"]']
-            if "guid" not in href:
-                continue
-            uid = href.split("guid=")[-1].split("&")[0]
-            list_item.parse_url = f"https://www.e-qyzc.com/waiburukou/xjcgGongGao/view/{uid}.html"
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-
-    # def exception_request(self, request, response):
-
-    def end_callback(self):
-        # list = ListItem()
-        # list.site=
-        print("爬虫结束")
-
-
-if __name__ == "__main__":
-    Qyzcdzzbcgjypt(redis_key="fwork:Qyzcdzzbcgjypt2").start()
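
Note that the detail URL here is not the list link itself: parse() pulls the guid query parameter out of the href and rebuilds a direct view URL. A sketch of that transform as the deleted code performs it; detail_url is an illustrative helper name.

    def detail_url(href: str) -> str:
        """Rebuild the detail-page URL from the guid carried in the list link."""
        if "guid" not in href:
            raise ValueError("list link carries no guid parameter")
        uid = href.split("guid=")[-1].split("&")[0]
        return f"https://www.e-qyzc.com/waiburukou/xjcgGongGao/view/{uid}.html"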

+ 0 - 15
NoteWork/details/__init__.py

@@ -1,15 +0,0 @@
-import requests
-
-
-headers = {
-
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-cookies = {
-    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
-}
-url = "http://www.snszgh.gov.cn/gsgg/index.html"
-response = requests.get(url, headers=headers,verify=False)
-
-print(response.text)
-print(response)
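
One note on the throwaway probe above: verify=False makes urllib3 emit an InsecureRequestWarning on every call, and the request has no timeout. A minimal variant that keeps the same URL and headers while silencing the warning; the 10-second timeout is an assumption, not in the original.

    import requests
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # verify=False would otherwise warn per request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }
    response = requests.get(
        "http://www.snszgh.gov.cn/gsgg/index.html",
        headers=headers,
        verify=False,
        timeout=10,  # assumed value; the original snippet sets no timeout
    )
    print(response.status_code, len(response.text))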

+ 0 - 194
NoteWork/details/detail_dtcookie.py

@@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  detail handler that generates cookies with a limited validity period and reuses them; IPs are not restricted by default
----------
-@author: 马国鹏
-"""
-import sys
-from urllib.parse import urljoin
-
-from untils.attachment import AttachmentDownloader
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from dtcookie_pool import *
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","item.site":"合肥市人民政府"},sort={"date":-1},limit=1)
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item, files_info=item.get("files"),
-                                          down_mid=item.get("down_mid"), **request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"), files_info=item.get("files"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''handle HTML-formatted responses'''
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when response.code is not an accepted status code, drop the current cookie and generate a new one'''
-            down_mid = request.down_mid
-            cookie_pool_class = down_mid.get("cookie_pool")
-            cookie_pool = eval(cookie_pool_class)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # notice detail content
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files = response.xpath(files_info.get("list_xpath"))
-            if request.files_info:
-                files_info = request.files_info
-                files = response.xpath(files_info.get("list_xpath"))
-                if len(files) > 0:
-                    attachments = {}
-                    for index, info in enumerate(files):
-                        file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                        file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                        if files_info.get("host"):
-                            file_url = urljoin(files_info.get("host"), file_url)
-                        if not files_info.get("file_type"):
-                            file_type = file_url.split("?")[0].split(".")[-1].lower()
-                        else:
-                            file_type = files_info.get("file_type")
-                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                            attachment = AttachmentDownloader().fetch_attachment(
-                                file_name=file_name, file_type=file_type, download_url=file_url,
-                                enable_proxy=False)
-                            attachments[len(attachments) + 1] = attachment
-                    if len(attachments) == 0:
-                        pass
-                    else:
-                        list_item.projectinfo = {"attachment": attachments}
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''handle JSON and other non-HTML responses'''
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when response.code is not an accepted status code, drop the current cookie and generate a new one'''
-            down_mid = request.down_mid
-            cookie_pool_class = down_mid.get("cookie_pool")
-            cookie_pool = eval(cookie_pool_class)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''after the request/parse retry limit is exceeded, write the original task back to mongo and update its failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    alert according to the spider's priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "max-age=0",
-            "Connection": "keep-alive",
-            "Host": "www.hefei.gov.cn",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-        }
-        down_mid = request.down_mid
-        cookie_pool_class = down_mid.get("cookie_pool")
-        cookie_pool = eval(cookie_pool_class)
-        request.cookies = cookie_pool.get_cookie()
-        request.headers=headers
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
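
The cookie handling in this handler is symmetric: download_midware attaches a cookie from the pool, and detail_get/detail_json drop that cookie and re-queue the request whenever the status code falls inside the per-site failure set carried in down_mid. A compressed sketch of that round trip, assuming a pool object exposing get_cookie()/del_cookie() like the PageCookiePool used above; attach_cookie and check_response are illustrative names.

    def attach_cookie(request, cookie_pool):
        """download_midware side: pull a (possibly cached) cookie from the pool."""
        request.cookies = cookie_pool.get_cookie()
        return request

    def check_response(request, response, cookie_pool):
        """parser side: on a blocked status code, burn the cookie and retry the request."""
        bad_codes = request.down_mid.get("code", [])   # e.g. [403, 412] -- per-site assumption
        if response.status_code in bad_codes:
            cookie_pool.del_cookie(request.cookies)    # discard the cookie that got blocked
            return request                             # re-queued; a fresh cookie is attached on the next pass
        return None                                    # response is usable, carry on parsing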

+ 0 - 134
NoteWork/details/detail_ztlbw.py

@@ -1,134 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.log import Log
-from feapder.utils.tools import wechat_warning
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from login_pool.zglbw import ZglbwPool
-from untils.attachment import AttachmentDownloader
-
-Log().info("")
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
-                                        sort={"date": -1}, limit=1)
-            print(data_lsit)
-            for item in data_lsit:
-                url = item.get("parse_url")
-                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
-                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
-                cookie = cookie.get_cookie().cookie
-                yield feapder.Request(url=url, item=item.get("item"),
-                                      callback=self.detail_get, base_info=item, render=True,
-                                      render_time=3, proxies=False, cookies=cookie)
-                self.to_db.delete(self.db_name, item)
-            break
-
-    def detail_get(self, request, response):
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key, items[key])
-        html = ''
-        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
-                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
-                      '//div[@class="login ng-star-inserted"]']
-        for xpath in xpath_list:
-            # import pdb
-            # pdb.set_trace()
-            html_one = response.xpath(xpath).extract_first()
-            if html_one is not None:
-                html += '\n'  # notice detail content
-                html += html_one  # concatenate the html fragments
-        print(html)
-        list_item.contenthtml = html
-        files_list = response.xpath("//iframe/@src").extract_first()
-        file_url = files_list.split("file=")[-1]
-        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
-        attachments = {}
-        file_name = list_item.title
-
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type='pdf', download_url=file_url,
-            enable_proxy=False)
-        attachments["0"] = attachment
-        list_item.projectinfo = {"attachments": attachments}
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''after the request/parse retry limit is exceeded, write the original task back to mongo and update its failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    alert according to the spider's priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:ztlbw").start()
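
The attachment URL in this handler is recovered from the PDF viewer's iframe src by splitting on 'file=' and undoing the percent-encoding with hand-rolled replace calls. urllib.parse.unquote performs the same decoding in one call; the sketch below assumes the iframe src embeds the full file URL after 'file=', as the deleted code does, and file_url_from_iframe is an illustrative helper name.

    from urllib.parse import unquote

    def file_url_from_iframe(src: str) -> str:
        """Recover the attachment URL embedded in the viewer iframe's file= parameter."""
        encoded = src.split("file=")[-1]
        return unquote(encoded)  # covers %3A %2F %3F %3D and any other escapes the hand-rolled replaces miss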

+ 0 - 1082
NoteWork/details/details

@@ -1,1082 +0,0 @@
-Thread-5|2022-01-28 17:06:38,101|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Details|2022-01-28 17:06:38,102|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Details|2022-01-28 17:06:38,103|scheduler.py|__add_task|line:215|INFO| 检查到有待做任务 8 条,不重下发新任务,将接着上回异常终止处继续抓取
-Thread-4|2022-01-28 17:06:47,221|collector.py|__input_data|line:108|INFO| 重置丢失任务完毕,共8条
-Thread-5|2022-01-28 17:06:48,223|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/
-                method = GET
-                body = {'files': {'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp'], 'file_type': 'doxc', 'url_key': 'http', 'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,270|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,270|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max retries exceeded with url: /3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000259309BDAF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95S\x05\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03_id\x94'
-              b'\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93\x94)\x81\x94'
-              b'C\x0ca\xf3a\xae\x95G\xb8\xb7\xd1\r\xc04\x94b\x8c\x05parse\x94'
-              b'\x8c\x0fself.detail_get\x94\x8c\x04item\x94}\x94(\x8c\x05ti'
-              b'tle\x94\x8c]\xe4\xb9\xa1\xe9\x95\x87\xe6\x95\xac\xe8'
-              b'\x80\x81\xe9\x99\xa2\xe5\xba\x8a\xe4\xbd\x8d\xe4'
-              b'\xbd\xbf\xe7\x94\xa8\xe7\x8e\x87\xe8\xbe\xbe\xe6'
-              b'\xa0\x87\xe5\x8e\xbf\xef\xbc\x88\xe5\xb8\x82\xe3'
-              b'\x80\x81\xe5\x8c\xba\xef\xbc\x89\xe7\xac\xac\xe4'
-              b'\xb8\x89\xe6\x96\xb9\xe8\xaf\x84\xe4\xbc\xb0\xe9'
-              b'\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae\xe9'
-              b'\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublis'
-              b'htime\x94\x8c\x132019-07-17 16:14:02\x94\x8c\nspidercode'
-              b'\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site\x94\x8c\x12\xe7'
-              b'\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6'
-              b'\x94\xbf\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80'
-              b'\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04are'
-              b'a\x94\x8c\x06\xe7\xa6\x8f\xe5\xbb\xba\x94\x8c\x04cit'
-              b'y\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c'
-              b'fhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672'
-              b'964c633ce/7c36067afe5b449ea66bae09d11cf45c/\x94\x8c\x0bpublis'
-              b'hdept\x94h\x18\x8c\tiscompete\x94\x88\x8c\x04type\x94'
-              b'h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-              b'me\x94h\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c'
-              b'\x05false\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bconte'
-              b'nthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94Nu'
-              b'\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94\x8c'
-              b'\x132022-01-28 11:23:26\x94\x8c\x0bdeal_detail\x94]'
-              b'\x94(\x8c\x17//div[@class="xl_main"]\x94\x8c\x19//div[@class="'
-              b'big-box-B"]\x94e\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94'
-              b'\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a63367'
-              b'2964c633ce/7c36067afe5b449ea66bae09d11cf45c/\x94\x8c\x0ereque'
-              b'st_params\x94}\x94\x8c\x06failed\x94K\x04\x8c\x06author\x94'
-              b'\x8c\x07details\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94'
-              b'N\x8c\x03pri\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94'
-              b'}\x94(\x8c\nlist_xpath\x94\x8ce//div[@id="fjxz"]/p[@class="ma'
-              b'r-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Ed'
-              b'itor"]//p/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname'
-              b'_xpath\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]'
-              b'\x94(\x8c\x03zip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94e\x8c\tfile'
-              b'_type\x94\x8c\x04doxc\x94\x8c\x07url_key\x94\x8c\x04htt'
-              b'p\x94\x8c\x04host\x94\x8cehttp://cz.fjzfcg.gov.cn/3500/notice/1'
-              b'c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11c'
-              b'f45c\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95;\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x17//di'
-                b'v[@class="xl_main"]\x94\x8c\x19//div[@class="big-box-B"]\x94'
-                b'e.',
- 'error_msg': 'requests.exceptions.ConnectionError: '
-              "HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max "
-              'retries exceeded with url: '
-              '/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/ '
-              '(Caused by '
-              "NewConnectionError('<urllib3.connection.HTTPConnection object "
-              'at 0x0000016835E6B850>: Failed to establish a new connection: '
-              "[Errno 11001] getaddrinfo failed'))",
- 'files': {'file_type': 'doxc',
-           'files_type': ['zip', 'doxc', 'ftp'],
-           'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c',
-           'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                         'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95$\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8c]\xe4\xb9\xa1\xe9\x95\x87\xe6\x95\xac\xe8\x80\x81'
-         b'\xe9\x99\xa2\xe5\xba\x8a\xe4\xbd\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe7'
-         b'\x8e\x87\xe8\xbe\xbe\xe6\xa0\x87\xe5\x8e\xbf\xef\xbc\x88\xe5\xb8'
-         b'\x82\xe3\x80\x81\xe5\x8c\xba\xef\xbc\x89\xe7\xac\xac\xe4\xb8\x89'
-         b'\xe6\x96\xb9\xe8\xaf\x84\xe4\xbc\xb0\xe9\x87\x87\xe8\xb4\xad\xe9'
-         b'\xa1\xb9\xe7\x9b\xae\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91'
-         b'\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-07-17 16:14:02\x94\x8c\n'
-         b'spidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site\x94\x8c'
-         b'\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6\x94\xbf'
-         b'\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80\x9a\xe7'
-         b'\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe7\xa6'
-         b'\x8f\xe5\xbb\xba\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref'
-         b'\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f9'
-         b'44709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c'
-         b'/\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04typ'
-         b'e\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime'
-         b'\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false'
-         b'\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml'
-         b'\x94h\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/'}
-                            
-Thread-5|2022-01-28 17:06:48,294|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/
-                method = GET
-                body = {'files': {'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp'], 'file_type': 'doxc', 'url_key': 'http', 'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,333|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,334|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max retries exceeded with url: /3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000259309ECA30>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x955\x05\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03_id\x94'
-              b'\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93\x94)\x81\x94'
-              b'C\x0ca\xf3a\xae\x95G\xb8\xb7\xd1\r\xc0B\x94b\x8c\x05parse\x94'
-              b'\x8c\x0fself.detail_get\x94\x8c\x04item\x94}\x94(\x8c\x05ti'
-              b'tle\x94\x8c?\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe5'
-              b'\x85\xbb\xe8\x80\x81\xe6\x9c\x8d\xe5\x8a\xa1\xe7'
-              b'\xbb\xbc\xe5\x90\x88\xe4\xbf\xa1\xe6\x81\xaf\xe5'
-              b'\xb9\xb3\xe5\x8f\xb0\xe9\x87\x87\xe8\xb4\xad\xe9'
-              b'\xa1\xb9\xe7\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5'
-              b'\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-0'
-              b'5-22 16:01:08\x94\x8c\nspidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94'
-              b'\x8c\x04site\x94\x8c\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7'
-              b'\x9c\x81\xe6\xb0\x91\xe6\x94\xbf\xe5\x8e\x85\x94\x8c\x07channe'
-              b'l\x94\x8c\x0c\xe9\x80\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a'
-              b'\x94\x8c\x04area\x94\x8c\x06\xe7\xa6\x8f\xe5\xbb\xba'
-              b'\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref'
-              b'\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/d'
-              b'2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da'
-              b'31a8/\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompet'
-              b'e\x94\x88\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07biddin'
-              b'g\x94\x8c\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94'
-              b'h\x18\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\n'
-              b'comeintime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94'
-              b'h\x18\x8c\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07de'
-              b'tails\x94\x8c\x04date\x94\x8c\x132022-01-28 11:23:26\x94\x8c'
-              b'\x0bdeal_detail\x94]\x94(\x8c\x17//div[@class="xl_main"'
-              b']\x94\x8c\x19//div[@class="big-box-B"]\x94e\x8c\x0bcreate_time'
-              b'\x94N\x8c\tparse_url\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/not'
-              b'ice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877'
-              b'162da31a8/\x94\x8c\x0erequest_params\x94}\x94\x8c\x06fail'
-              b'ed\x94K\x04\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex'
-              b'_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07pro'
-              b'xies\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8ce'
-              b'//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="resu'
-              b'lt"]//u/a|//div[@class="TRS_Editor"]//p/a\x94\x8c\turl_xpat'
-              b'h\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./tex'
-              b't()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04doxc'
-              b'\x94\x8c\x03ftp\x94e\x8c\tfile_type\x94\x8c\x04doxc\x94\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94\x8c\x04host\x94\x8cehttp://cz.f'
-              b'jzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91'
-              b'255ff3752c4bc48770877162da31a8\x94u\x8c\x05error\x94N\x8c\x04c'
-              b'ode\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95;\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x17//di'
-                b'v[@class="xl_main"]\x94\x8c\x19//div[@class="big-box-B"]\x94'
-                b'e.',
- 'error_msg': 'requests.exceptions.ConnectionError: '
-              "HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max "
-              'retries exceeded with url: '
-              '/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/ '
-              '(Caused by '
-              "NewConnectionError('<urllib3.connection.HTTPConnection object "
-              'at 0x0000016835E877F0>: Failed to establish a new connection: '
-              "[Errno 11001] getaddrinfo failed'))",
- 'files': {'file_type': 'doxc',
-           'files_type': ['zip', 'doxc', 'ftp'],
-           'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8',
-           'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                         'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x06\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8c?\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe5\x85\xbb'
-         b'\xe8\x80\x81\xe6\x9c\x8d\xe5\x8a\xa1\xe7\xbb\xbc\xe5\x90\x88\xe4'
-         b'\xbf\xa1\xe6\x81\xaf\xe5\xb9\xb3\xe5\x8f\xb0\xe9\x87\x87\xe8\xb4'
-         b'\xad\xe9\xa1\xb9\xe7\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac'
-         b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-05-22 16:01'
-         b':08\x94\x8c\nspidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site'
-         b'\x94\x8c\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6'
-         b'\x94\xbf\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80'
-         b'\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06'
-         b'\xe7\xa6\x8f\xe5\xbb\xba\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bc'
-         b'ompetehref\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/no'
-         b'tice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31'
-         b'a8/\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04t'
-         b'ype\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-         b'me\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05fal'
-         b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h'
-         b'\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/'}
-                            
-Thread-5|2022-01-28 17:06:48,380|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,394|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,395|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x84\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1a\x81\xdbV\xa5\x9f\xf9hq\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cQ\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe8\xbd\xbb\xe5\xb7\xa5\xe8\x81\x8c\xe4\xb8\x9a'
-              b'\xe6\x8a\x80\xe6\x9c\xaf\xe5\xad\xa6\xe9\x99\xa2'
-              b'\xe6\x96\xb0\xe8\x83\xbd\xe6\xba\x90\xe6\xb1\xbd'
-              b'\xe8\xbd\xa6\xe6\xa3\x80\xe6\xb5\x8b\xe5\xae\x9e'
-              b'\xe8\xae\xad\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad'
-              b'\xe7\xbd\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 15:09'
-              b':43\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x0c\xe9\x87\x87'
-              b'\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area'
-              b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city'
-              b'\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`'
-              b'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8'
-              b'a7e3a9c7e946b44017e9f51af707454.html\x94\x8c\x0bpublishde'
-              b'pt\x94h\x18\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x18\x8c'
-              b'\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94'
-              b'h\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c\x05fa'
-              b'lse\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenth'
-              b'tml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94N'
-              b'u\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94'
-              b'\x8c\x132022-01-28 15:55:12\x94\x8c\x0bdeal_detail\x94'
-              b']\x94\x8c&//div[@class="info-article in active"]\x94a\x8c\x0bcr'
-              b'eate_time\x94N\x8c\tparse_url\x94\x8c`https://gdgpo.czt.gd.go'
-              b'v.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51'
-              b'af707454.html\x94\x8c\x0erequest_params\x94}\x94\x8c\x06faile'
-              b'd\x94K\x02\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex_'
-              b'js\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07prox'
-              b'ies\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8c-/'
-              b'/div[@class="info-article in active"]//div/a\x94\x8c\turl_xpath'
-              b'\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./text'
-              b'()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04doxc\x94'
-              b'\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c\x07url_key\x94\x8c\x04htt'
-              b'p\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1d\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cQ\xe5\xb9\xbf\xe4\xb8\x9c\xe8\xbd\xbb\xe5\xb7\xa5'
-         b'\xe8\x81\x8c\xe4\xb8\x9a\xe6\x8a\x80\xe6\x9c\xaf\xe5\xad\xa6\xe9'
-         b'\x99\xa2\xe6\x96\xb0\xe8\x83\xbd\xe6\xba\x90\xe6\xb1\xbd\xe8\xbd'
-         b'\xa6\xe6\xa3\x80\xe6\xb5\x8b\xe5\xae\x9e\xe8\xae\xad\xe8\xae\xbe'
-         b'\xe5\xa4\x87\xe8\xb4\xad\xe7\xbd\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5'
-         b'\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28'
-         b' 15:09:43\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_cggg'
-         b'\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6'
-         b'\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c'
-         b'\x07channel\x94\x8c\x0c\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac'
-         b'\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94'
-         b'\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94'
-         b'\x8c`https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7'
-         b'e3a9c7e946b44017e9f51af707454.html\x94\x8c\x0bpublishdept'
-         b'\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T'
-         b'\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomein'
-         b'time\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c'
-         b'\ncomeintime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail'
-         b'\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html'}
-                            
-Thread-5|2022-01-28 17:06:48,446|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,458|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,459|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x92\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1c\x81\xdbV\xa5\x9f\xf9hv\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cW\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe8\x8b\xb1\xe5\xbe\xb7\xe7\x9b\x91'
-              b'\xe7\x8b\xb1\xe8\x81\x8c\xe5\xb7\xa5\xe9\xa5\xad'
-              b'\xe5\xa0\x82\xe8\xbf\x90\xe8\x90\xa5\xe6\x9c\x8d'
-              b'\xe5\x8a\xa1\xe9\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae('
-              b'GZSW21201FG4176A)\xe7\xbb\x93\xe6\x9e\x9c\xe5\x85\xac\xe5\x91'
-              b'\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 14:51:5'
-              b'6\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcgwxwz_zbcjgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe4\xb8\xad'
-              b'\xe6\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8c`https://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.ht'
-              b'ml\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 15:55:14\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8c`https://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e'
-              b'9e62ca017e9f00529a7d80.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x02\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95+\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cW\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe8\x8b\xb1'
-         b'\xe5\xbe\xb7\xe7\x9b\x91\xe7\x8b\xb1\xe8\x81\x8c\xe5\xb7\xa5\xe9'
-         b'\xa5\xad\xe5\xa0\x82\xe8\xbf\x90\xe8\x90\xa5\xe6\x9c\x8d\xe5\x8a'
-         b'\xa1\xe9\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae(GZSW21201FG417'
-         b'6A)\xe7\xbb\x93\xe6\x9e\x9c\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpu'
-         b'blishtime\x94\x8c\x132022-01-28 14:51:56\x94\x8c\nspidercode'
-         b'\x94\x8c\x15gd_gdszfcgwxwz_zbcjgg\x94\x8c\x04site\x94'
-         b'\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba'
-         b'\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel'
-         b'\x94\x8c\x12\xe4\xb8\xad\xe6\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5'
-         b'\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8'
-         b'\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c'
-         b'\x04href\x94\x8c`https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx'
-         b'/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html\x94\x8c\x0bpubli'
-         b'shdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x0e\x8c'
-         b'\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\n'
-         b'comeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d'
-         b'\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06deta'
-         b'il\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html'}
-                            
-Thread-5|2022-01-28 17:06:48,484|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,503|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,504|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x8f\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1c\x81\xdbV\xa5\x9f\xf9hw\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cT\xe4\xbd\x9b\xe5\xb1\xb1'
-              b'\xe5\xb8\x82\xe9\xa1\xba\xe5\xbe\xb7\xe5\x8c\xba'
-              b'\xe4\xba\xba\xe6\xb0\x91\xe6\xb3\x95\xe9\x99\xa2'
-              b'\xe4\xbf\xa1\xe6\x81\xaf\xe5\x8c\x96\xe8\xbd\xaf'
-              b'\xe7\xa1\xac\xe4\xbb\xb6\xe8\xae\xbe\xe5\xa4\x87'
-              b'\xe7\xbb\xb4\xe6\x8a\xa4\xe6\x9c\x8d\xe5\x8a\xa1'
-              b'\xe9\xa1\xb9\xe7\x9b\xae\xe7\xbb\x93\xe6\x9e\x9c'
-              b'\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132'
-              b'022-01-28 14:37:36\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcg'
-              b'wxwz_zbcjgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c'
-              b'\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel\x94'
-              b'\x8c\x12\xe4\xb8\xad\xe6\xa0\x87\xe6\x88\x90\xe4'
-              b'\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94'
-              b'\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94'
-              b'\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https'
-              b'://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7'
-              b'dd7e9e4962017e9f56e40058a5.html\x94\x8c\x0bpublishdept\x94h\x18'
-              b'\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x18\x8c\x01'
-              b'T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h'
-              b'\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c\x05fal'
-              b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontentht'
-              b'ml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94Nu'
-              b'\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94\x8c'
-              b'\x132022-01-28 15:55:14\x94\x8c\x0bdeal_detail\x94]\x94\x8c&/'
-              b'/div[@class="info-article in active"]\x94a\x8c\x0bcreate_time'
-              b'\x94N\x8c\tparse_url\x94\x8c`https://gdgpo.czt.gd.gov.cn/free'
-              b'cms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.'
-              b'html\x94\x8c\x0erequest_params\x94}\x94\x8c\x06failed'
-              b'\x94K\x02\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex_j'
-              b's\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07proxi'
-              b'es\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8c-//'
-              b'div[@class="info-article in active"]//div/a\x94\x8c\turl_xp'
-              b'ath\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./text('
-              b')\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04do'
-              b'xc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c\x07url_ke'
-              b'y\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00'
-              b'u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95(\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cT\xe4\xbd\x9b\xe5\xb1\xb1\xe5\xb8\x82\xe9\xa1\xba'
-         b'\xe5\xbe\xb7\xe5\x8c\xba\xe4\xba\xba\xe6\xb0\x91\xe6\xb3\x95\xe9'
-         b'\x99\xa2\xe4\xbf\xa1\xe6\x81\xaf\xe5\x8c\x96\xe8\xbd\xaf\xe7\xa1'
-         b'\xac\xe4\xbb\xb6\xe8\xae\xbe\xe5\xa4\x87\xe7\xbb\xb4\xe6\x8a\xa4'
-         b'\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7\x9b\xae\xe7\xbb\x93\xe6'
-         b'\x9e\x9c\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c'
-         b'\x132022-01-28 14:37:36\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcgwx'
-         b'wz_zbcjgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8'
-         b'\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad'
-         b'\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe4\xb8\xad\xe6'
-         b'\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c'
-         b'\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04cit'
-         b'y\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`htt'
-         b'ps://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e'
-         b'4962017e9f56e40058a5.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tisco'
-         b'mpete\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07biddin'
-         b'g\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94'
-         b'h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncome'
-         b'intime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94'
-         b'h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html'}
-                            
-Thread-5|2022-01-28 17:06:48,552|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,564|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,565|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x8d\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf2\x95\xb9{\xdc<\xbf\xf2)V\xe6\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cZ\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe4\xba\xba\xe5\x8a\x9b\xe8\xb5\x84'
-              b'\xe6\xba\x90\xe5\xb8\x82\xe5\x9c\xba\xe8\xae\xbe'
-              b'\xe6\x96\xbd\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad'
-              b'\xe7\xbd\xae\xe9\x9b\x86\xe6\x88\x90\xe5\x8f\x8a'
-              b'\xe5\xb1\x95\xe9\x99\x88\xe5\xb8\x83\xe7\xbd\xae'
-              b'\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7\x9b\xae'
-              b'\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bp'
-              b'ublishtime\x94\x8c\x132022-01-27 19:14:45\x94\x8c\nspidercod'
-              b'e\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c\x04site\x94\x8c'
-              b'\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94'
-              b'\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd'
-              b'\x91\x94\x8c\x07channel\x94\x8c\x0c\xe9\x87\x87\xe8\xb4\xad'
-              b'\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06'
-              b'\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00'
-              b'\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https://gdg'
-              b'po.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e90'
-              b'32e5017e9b37b0c50534.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\t'
-              b'iscompete\x94\x88\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bi'
-              b'dding\x94\x8c\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94'
-              b'h\x18\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\n'
-              b'comeintime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94'
-              b'h\x18\x8c\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07de'
-              b'tails\x94\x8c\x04date\x94\x8c\x132022-01-27 20:53:12\x94\x8c'
-              b'\x0bdeal_detail\x94]\x94\x8c&//div[@class="info-article in a'
-              b'ctive"]\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8c`ht'
-              b'tps://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7'
-              b'efa517e9032e5017e9b37b0c50534.html\x94\x8c\x0erequest_par'
-              b'ams\x94}\x94\x8c\x06failed\x94K\x13\x8c\x06author\x94\x8c\x07'
-              b'details\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c'
-              b'\x03pri\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94'
-              b'(\x8c\nlist_xpath\x94\x8c-//div[@class="info-article in active'
-              b'"]//div/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_x'
-              b'path\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94('
-              b'\x8c\x03zip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf'
-              b'\x94e\x8c\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94'
-              b'N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95&\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cZ\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe4\xba\xba'
-         b'\xe5\x8a\x9b\xe8\xb5\x84\xe6\xba\x90\xe5\xb8\x82\xe5\x9c\xba\xe8'
-         b'\xae\xbe\xe6\x96\xbd\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad\xe7\xbd'
-         b'\xae\xe9\x9b\x86\xe6\x88\x90\xe5\x8f\x8a\xe5\xb1\x95\xe9\x99\x88'
-         b'\xe5\xb8\x83\xe7\xbd\xae\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7'
-         b'\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac\xe5\x91\x8a\x94\x8c'
-         b'\x0bpublishtime\x94\x8c\x132022-01-27 19:14:45\x94\x8c\nspiderc'
-         b'ode\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c\x04site\x94\x8c\x18\xe5'
-         b'\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87'
-         b'\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x0c'
-         b'\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94'
-         b'\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c'
-         b'\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https://gdgpo.czt.gd.gov.'
-         b'cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.h'
-         b'tml\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04t'
-         b'ype\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-         b'me\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05fal'
-         b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h'
-         b'\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html'}
-                            
-Thread-5|2022-01-28 17:06:48,615|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,628|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,628|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x86\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1\\\x81\xdbV\xa5\x9f\xf9hU\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cE\xe9\x9f\xb6\xe5\x85\xb3'
-              b'\xe5\xb8\x82\xe6\xad\xa6\xe6\xb1\x9f\xe5\x8c\xba'
-              b'\xe4\xba\xba\xe6\xb0\x91\xe6\xa3\x80\xe5\xaf\x9f'
-              b'\xe9\x99\xa2\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96'
-              b'\xe5\x9c\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2'
-              b'\xe8\xb4\xad\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 15:48'
-              b':52\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_ysgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5'
-              b'\xe5\xad\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1'
-              b'\xe6\x81\xaf\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b1'
-              b'0.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 15:55:07\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8cdhttps://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1'
-              b'545-438c-8ad6-ccdd8bd71b10.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x02\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1b\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cE\xe9\x9f\xb6\xe5\x85\xb3\xe5\xb8\x82\xe6\xad\xa6'
-         b'\xe6\xb1\x9f\xe5\x8c\xba\xe4\xba\xba\xe6\xb0\x91\xe6\xa3\x80\xe5'
-         b'\xaf\x9f\xe9\x99\xa2\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96\xe5\x9c'
-         b'\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2\xe8\xb4\xad\xe6\x88\x90'
-         b'\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94'
-         b'\x8c\x132022-01-28 15:48:52\x94\x8c\nspidercode\x94\x8c\x13gd_gdsz'
-         b'fcgwxwz_ysgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c'
-         b'\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7'
-         b'\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5\xe5\xad'
-         b'\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1\xe6\x81\xaf\x94\x8c\x04area'
-         b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94'
-         b'\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd'
-         b'.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8'
-         b'bd71b10.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete'
-         b'\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07bidding'
-         b'\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94h'
-         b'\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomei'
-         b'ntime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94h'
-         b'\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html'}
-                            
-Thread-5|2022-01-28 17:06:48,677|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,691|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,691|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x86\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3Z\x94J\xa3\xe2Z\x12\xe9\t\x00\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cE\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe4\xbd\x9b\xe5\xb1\xb1\xe8\x88\xaa'
-              b'\xe9\x81\x93\xe4\xba\x8b\xe5\x8a\xa1\xe4\xb8\xad'
-              b'\xe5\xbf\x83\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96'
-              b'\xe5\x9c\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2'
-              b'\xe8\xb4\xad\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 10:00'
-              b':38\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_ysgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5'
-              b'\xe5\xad\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1'
-              b'\xe6\x81\xaf\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c91'
-              b'3.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 10:53:07\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8cdhttps://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-1'
-              b'02a-4923-b4cb-dadfdc82c913.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x0b\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1b\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cE\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe4\xbd\x9b'
-         b'\xe5\xb1\xb1\xe8\x88\xaa\xe9\x81\x93\xe4\xba\x8b\xe5\x8a\xa1\xe4'
-         b'\xb8\xad\xe5\xbf\x83\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96\xe5\x9c'
-         b'\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2\xe8\xb4\xad\xe6\x88\x90'
-         b'\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94'
-         b'\x8c\x132022-01-28 10:00:38\x94\x8c\nspidercode\x94\x8c\x13gd_gdsz'
-         b'fcgwxwz_ysgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c'
-         b'\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7'
-         b'\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5\xe5\xad'
-         b'\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1\xe6\x81\xaf\x94\x8c\x04area'
-         b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94'
-         b'\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd'
-         b'.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfd'
-         b'c82c913.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete'
-         b'\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07bidding'
-         b'\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94h'
-         b'\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomei'
-         b'ntime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94h'
-         b'\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html'}
-                            
-Thread-5|2022-01-28 17:06:48,741|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Thread-3|2022-01-28 17:06:49,216|tools.py|dumps_json|line:843|ERROR| Object of type ObjectId is not JSON serializable
-Thread-3|2022-01-28 17:06:49,222|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [{'_id': ObjectId('61f361ae9547b8b7d10dc034'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 11:23:26',
-  'deal_detail': ['//div[@class="xl_main"]', '//div[@class="big-box-B"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 5,
-  'files': {'file_type': 'doxc',
-            'files_type': ['zip', 'doxc', 'ftp'],
-            'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c',
-            'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                          'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '福建',
-           'channel': '通知公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2019-07-17 16:14:02',
-           'sendflag': 'false',
-           'site': '福建省民政厅',
-           'spidercode': 'fj_fjsmzt_tzgg',
-           'title': '乡镇敬老院床位使用率达标县(市、区)第三方评估采购项目采购公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f361ae9547b8b7d10dc042'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 11:23:26',
-  'deal_detail': ['//div[@class="xl_main"]', '//div[@class="big-box-B"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 5,
-  'files': {'file_type': 'doxc',
-            'files_type': ['zip', 'doxc', 'ftp'],
-            'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8',
-            'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                          'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '福建',
-           'channel': '通知公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2019-05-22 16:01:08',
-           'sendflag': 'false',
-           'site': '福建省民政厅',
-           'spidercode': 'fj_fjsmzt_tzgg',
-           'title': '福建省养老服务综合信息平台采购项目招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16181db56a59ff96871'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:12',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '采购公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 15:09:43',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_cggg',
-           'title': '广东轻工职业技术学院新能源汽车检测实训设备购置招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16381db56a59ff96876'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:14',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '中标成交公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 14:51:56',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_zbcjgg',
-           'title': '广东省英德监狱职工饭堂运营服务采购项目(GZSW21201FG4176A)结果公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16381db56a59ff96877'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:14',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '中标成交公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 14:37:36',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_zbcjgg',
-           'title': '佛山市顺德区人民法院信息化软硬件设备维护服务项目结果公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f295b97bdc3cbff22956e6'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-27 20:53:12',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 20,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '采购公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-27 19:14:45',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_cggg',
-           'title': '广东省人力资源市场设施设备购置集成及展陈布置服务项目招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a15c81db56a59ff96855'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:07',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '电子卖场信息',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 15:48:52',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_ysgg',
-           'title': '韶关市武江区人民检察院电子卖场直接订购成交公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f35a944aa3e25a12e90900'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 10:53:07',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 12,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '电子卖场信息',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 10:00:38',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_ysgg',
-           'title': '广东省佛山航道事务中心电子卖场直接订购成交公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}}]
-                    
-Thread-3|2022-01-28 17:06:49,723|mongo_pipeline.py|save_items|line:49|INFO| 共导出 8 条数据到 mgp_list,  新增 8条, 重复 0 条
-Details|2022-01-28 17:06:53,273|scheduler.py|<lambda>|line:117|INFO| 
-********** feapder end **********
-Details|2022-01-28 17:06:53,275|scheduler.py|spider_end|line:520|INFO| 《magp:details1》爬虫结束,耗时 3分20秒
-Details|2022-01-28 17:06:53,276|scheduler.py|delete_tables|line:444|INFO| 正在删除key magp:details1:z_spider_status
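
The deleted log above ends with the same two failures repeated for every detail request: tools.py (dumps_json) cannot JSON-serialize the pickled request fields ("Object of type bytes is not JSON serializable"), and Details.detail_get aborts with "ValueError: dictionary update sequence element #0 has length 1; 2 is required" while the response is still None. That ValueError is the usual symptom of handing a plain string to something that expects a mapping, for example dict(...) or dict.update(...), so it most likely fires while the request keyword arguments are being assembled rather than during parsing. A minimal, self-contained repro of the message (hypothetical names, not the project's code):

    # Reproduces the ValueError seen in the log above.
    # Illustrative only: it demonstrates the failure mode, not the spider's code path.
    def build_kwargs(request_params):
        # dict() expects an iterable of (key, value) pairs; a plain string is
        # iterated character by character, and each 1-character "pair" fails.
        return dict(request_params)

    print(build_kwargs({"timeout": 22}))   # -> {'timeout': 22}
    try:
        build_kwargs("timeout=22")
    except ValueError as exc:
        print(exc)  # dictionary update sequence element #0 has length 1; 2 is required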

+ 0 - 170
NoteWork/details/details.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
-            for item in data_lsit:
-                print(11111)
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files:
-            files_info = request.files
-            files = response.xpath(files_info.get("list_xpath"))
-            if len(files) > 0:
-                attachments = {}
-                for index, info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name, file_type=file_type, download_url=file_url,
-                            enable_proxy=False)
-                        attachments[len(attachments) + 1] = attachment
-                if attachments:
-                    list_item.projectinfo = {"attachment": attachments}
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
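
The attachment handling in Details.detail_get above is driven entirely by the "files" dict carried with each mongo task (list_xpath, url_xpath, name_xpath, files_type, url_key, plus optional host and file_type). A condensed sketch of those selection rules, kept separate from the download step; pick_attachments is a hypothetical helper and the elements are assumed to be lxml-style nodes:

    # Sketch of the attachment-selection rules used by detail_get above.
    # pick_attachments is illustrative; it only filters links and does not download.
    from urllib.parse import urljoin

    def pick_attachments(elements, files_info):
        """Return {file_name: file_url} for links that match the configured rules."""
        picked = {}
        for el in elements:
            urls = el.xpath(files_info["url_xpath"])
            names = el.xpath(files_info["name_xpath"])
            if not urls or not names:
                continue
            file_url, file_name = urls[0], names[0]
            if files_info.get("host"):            # make relative links absolute
                file_url = urljoin(files_info["host"], file_url)
            # an explicit file_type wins, otherwise infer it from the URL path
            file_type = files_info.get("file_type") or file_url.split("?")[0].split(".")[-1].lower()
            if file_type in files_info["files_type"] and files_info["url_key"] in file_url:
                picked[file_name] = file_url
        return picked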

+ 0 - 180
NoteWork/details/details_ces.py

@@ -1,180 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","item.spidercode":"a_szsjzsczhcxpt_zbxx"},sort={"item.publishtime":-1},limit=1)
-            for item in data_lsit:
-                print(item.get("item"))
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item)
-                else:
-                    # print(item.get("files"))
-                    files = {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href',
-                     'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf',"ddf"], 'url_key': 'http'}
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=files,
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>1:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[index] = attachment
-                list_item.projectinfo=attachments
-            else:
-                for info in files:
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if files_info.get("file_name"):
-                        file_name = files_info.get("file_name")
-                    if files_info.get("file_type"):
-                        file_type = files_info.get("file_type")
-                    else:
-                        file_type = file_name.split("?")[0].split(".")[-1]
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name, file_type=file_type, download_url=file_url,
-                            enable_proxy=False)
-                        list_item.projectinfo = attachment
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
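
details_ces.py is a test variant of the spider above (it pins a single spidercode, hard-codes the files config and treats the single-attachment case separately), but both share the same failed_request escalation: map the status code to an error class, bump the failed counter, and for tasks with pri > 5 send a WeChat alert only at the 10/30/50/100/200 failure marks, throttled per site through send_list. A simplified sketch of that rule (classify_error and should_alert are illustrative names):

    # Sketch of the failed_request escalation shared by these Details spiders.
    ALERT_MARKS = (10, 30, 50, 100, 200)

    def classify_error(code):
        if code is None or code == 0:
            return "timeout"
        if 200 <= code < 300:
            return "analysis"        # page downloaded but parsing failed
        if 300 <= code < 500:
            return "download"
        return "servers"

    def should_alert(failed, pri, site, send_list):
        """Alert high-priority tasks at fixed failure counts, throttled per site."""
        if not pri or pri <= 5:
            return False
        if failed not in ALERT_MARKS and failed <= 200:
            return False
        # mirrors the spiders' check: send_list.count(site) == pri - 5
        return send_list.count(site) == pri - 5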

+ 0 - 165
NoteWork/details/details_cookie.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  detail-page handler that generates cookies with a limited lifetime and reuses them from a pool; IP is not restricted by default
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''handle HTML-formatted responses'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''failure handling: when the text marker is set and appears in response.text, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when the response code is not an expected status code, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''handle JSON and other non-HTML response formats'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''failure handling: when the text marker is set and appears in response.text, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when the response code is not an expected status code, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 115
NoteWork/details/details_firefox.py

@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()
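
failed_request above buckets the final status code into the error label stored on MgpListItem.error before the task is written back to mgp_list. The same mapping as a standalone helper, with the buckets taken directly from the deleted code (the function name is illustrative):

    def classify_failure(status_code):
        """Map the last HTTP status code onto an error family: analysis / download / servers / timeout."""
        if not status_code:                      # no response at all
            return "timeout"
        if 200 <= status_code < 300:
            return "analysis"                    # page downloaded, parsing failed
        if 300 <= status_code < 500:
            return "download"                    # redirects and client errors
        if status_code >= 500:
            return "servers"
        return "timeout"

    assert classify_failure(0) == "timeout"
    assert classify_failure(404) == "download"
    assert classify_failure(503) == "servers"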

+ 0 - 150
NoteWork/details/details_login.py

@@ -1,150 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import LoginCookiePool, PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-                print(down_mid)
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在response.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response.code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在response.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response.code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        mgp = MgpListItem()
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        print(f'......{mgp.failed}')
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 88
NoteWork/details/dtcookie_pool.py

@@ -1,88 +0,0 @@
-import json
-import re
-import sys
-
-import execjs
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import PageCookiePool
-import requests
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self,redis_key,header,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
-        super(DTCookiePool, self).__init__(redis_key,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
-        self.headers=header
-        self.page_url = page_url
-
-    def create_cookie(self,):
-        session = requests.Session()
-        start_url = self.page_url
-        print(self.headers)
-        res = session.get(start_url, headers=self.headers,verify=False)
-        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
-        js_func = 'function sd() { return ' + js_func + "}"
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        ss = ctx.call("sd")
-        cookies = {}
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-        res = session.get(start_url, cookies=cookies, headers=self.headers)
-        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
-        js_func = re.sub("<(/*?)script>", "", res.text)
-        location = re.compile('location(.*?)}else')
-        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
-        setTimeout2 = re.compile('setTimeout(.*?)document')
-        gox = re.compile('};go(.*?)\)')
-        js_func = re.sub(location, "}else", js_func)
-        js_func = re.sub(setTimeout, "       document", js_func)
-        js_func = re.sub(setTimeout2, "       document", js_func)
-        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
-        js_func = '''const jsdom = require("jsdom");
-        const {JSDOM} = jsdom;
-        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
-        window = dom.window;
-        document = window.document;''' + js_func
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        with open('ex_js.js', 'w+', encoding='utf-8') as f:
-            f.write(js_func)
-        print(js_do_data)
-        ss = ctx.call("go", json.loads(js_do_data))
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-        res = session.get(start_url, headers=self.headers, cookies=cookies)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        return cookies
-
-if __name__ == '__main__':
-    headers = {
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Accept-Encoding": "gzip, deflate, br",
-    "Accept-Language": "zh-CN,zh;q=0.9",
-    "Cache-Control": "max-age=0",
-    "Connection": "keep-alive",
-    "Host": "www.hefei.gov.cn",
-    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"Windows\"",
-    "Sec-Fetch-Dest": "document",
-    "Sec-Fetch-Mode": "navigate",
-    "Sec-Fetch-Site": "none",
-    "Sec-Fetch-User": "?1",
-    "Upgrade-Insecure-Requests": "1",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-}
-
-    cookie_pool = DTCookiePool(
-        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
-        header=headers, redis_key="dongtaices")
-    cookie = cookie_pool.get_cookie()
-    print(cookie)
-    # cookie_pool.del_cookie(cookie)
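
create_cookie above evaluates the site's anti-bot JavaScript and gets back a document.cookie style string, which is then split into a dict before the page is requested again. That parsing step in isolation, with a sample string for illustration; unlike the split("=")[-1] used above, partition keeps values that themselves contain an equals sign:

    def cookie_string_to_dict(cookie_str):
        """Turn 'name1=value1; name2=value2' output from document.cookie into a requests-style dict."""
        cookies = {}
        for part in cookie_str.split(";"):
            if "=" in part:
                name, _, value = part.strip().partition("=")
                cookies[name] = value
        return cookies

    print(cookie_string_to_dict("acw_tc=abc123; _session=xyz=="))
    # {'acw_tc': 'abc123', '_session': 'xyz=='}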

File diff content is too large to display
+ 0 - 1
NoteWork/details/file/sj.js


+ 0 - 34
NoteWork/details/迁移.py

@@ -1,34 +0,0 @@
-from feapder.db.mongodb import MongoDB
-
-
-class Details:
-    _to_db = None
-    _to_db_xs = None
-    db_name = 'mgp_list'
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    @property
-    def to_db_xs(self):
-        if not self._to_db_xs:
-            self._to_db_xs = MongoDB(port=27001)
-        return self._to_db_xs
-    def main(self):
-        data_lsit = self.to_db.find(self.db_name, {"parser_name": "details"},sort={"date":-1})
-        for item in data_lsit:
-            # print(item.get("item").get("publishtime"))
-            print(item.get("date"))
-            del item["_id"]
-            # # print(item)
-            if item.get("item").get("publishtime") > '2021-12-15 09:12:43':
-                print(item)
-            else:
-                # self.to_db_xs.add(self.db_name, item)
-                self.to_db.delete(self.db_name, item)
-            # self.to_db.delete(self.db_name, item)
-
-Details().main()

+ 0 - 0
spiders/__init__.py


+ 0 - 0
spiders/李宗泽/__init__.py


+ 0 - 0
spiders/马国鹏/__init__.py


+ 0 - 88
spiders/马国鹏/中国南方航空采购招标网.py

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-21 16:19:50
----------
-@summary:中国南方航空采购招标网.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zgnfhkcgzbw(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'url', 'crawl_page'])
-
-         self.menus = [
-             Menu('其它公告', 'a_zgnfhkcgzbw_cggg',
-                  'https://csbidding.csair.com/cms/channel/qtgg/index.htm', 1),
-             Menu('非招标采购-采购结果', 'a_zgnfhkcgzbw_cgjg',
-                  'https://csbidding.csair.com/cms/channel/cgjg/index.htm', 2),
-             Menu('招标公告', 'a_zgnfhkcgzbw_zbgg',
-                  'https://csbidding.csair.com/cms/channel/zbgg/index.htm', 1),
-             Menu('中标公告', 'a_zgnfhkcgzbw_zhbgg',
-                  'https://csbidding.csair.com/cms/channel/bidzbgg/index.htm', 1),
-             Menu('评标公示', 'a_zgnfhkcgzbw_pbgs',
-                  'https://csbidding.csair.com/cms/channel/pbgs/index.htm', 1),
-             Menu('非招标采购-采购公告', 'a_zgnfhkcgzbw_fzbcg_cggg',
-                  'https://csbidding.csair.com/cms/channel/cggg/index.htm', 2),
-             Menu('非招标采购-其它公告', 'a_zgnfhkcgzbw_qtgg',
-                  'https://csbidding.csair.com/cms/channel/fzbqtgg/index.htm', 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            for page in range(1,menu.crawl_page+1):
-                start_url = menu.url + f'?pageNo={page}'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul[@id='list1']/li")
-        for info in info_list:
-            href = info.xpath('./a/@href').extract_first()
-            title = info.xpath('./a/@title').extract_first()
-            # import pdb
-            # pdb.set_trace()
-            # print(info.xpath('./a/text()'))
-            create_time = info.xpath('./a/em/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time.strip()  # 标书发布时间
-            data_item.site = "中国南方航空采购招标网"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.__table_name__= 'mgp_list'
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="main-text"]']
-            # list_item.create_time = '//div[@class="article-author"]/text()[-1]'
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Zgnfhkcgzbw(redis_key="fwork:Zgnfhkcgzbw").start()
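
The list spiders in this commit all follow the same incremental pattern around feapder's Dedup: each href is checked with filter_exist_data (an empty result means "already collected"), an item is yielded only for unseen links, and the batch is added to the Bloom filter after the items have been emitted. A stripped-down sketch of that split, with an illustrative helper name:

    from feapder.dedup import Dedup

    def split_new_links(hrefs):
        """Return the Dedup instance and the subset of hrefs not yet in the Bloom filter."""
        dedup = Dedup(Dedup.BloomFilter)
        fresh = [h for h in hrefs if dedup.filter_exist_data([h])]   # [] means "already seen"
        return dedup, fresh

    # usage mirrors parse() above: yield one item per fresh href, then dedup.add(fresh)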

+ 0 - 75
spiders/马国鹏/中国石化物质采购电子商务平台.py

@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-16 15:53:39
----------
-@summary:中国石化物质采购电子商务平台.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import timestamp_to_date
-
-
-class Zshcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('	独家采购公示', 'a_zgshwzcgdzswpt_djcggs', "Notice", 2),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                start_url = f'https://ec.sinopec.com/f/supp/bid/queryOnlyBill.do?pageNo={page}&_=1639640334801'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        # print(response.json)
-        info_list = response.json.get("result").get("result")
-        # return
-        for info in info_list:
-            href = f'https://ec.sinopec.com/f/supp/notice/viewArticle.do?id={info.get("id")}'
-            title =info.get("title")
-            create_time = timestamp_to_date(info.get("createdate").get("time")/1000)
-
-            list_item = DataBakItem()  # 存储数据的管道
-            list_item.href = href  # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title  # 标题
-            list_item.publishtime = create_time  # 标书发布时间
-            list_item.site = "中国石化物资采购电子商务平台"
-            list_item.area = "全国"  # 城市默认:全国
-            list_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            mgp =  MgpListItem()
-            mgp.parse = "self.detail_get"
-            mgp.parser_name = "details"
-            mgp.item = list_item.to_dict
-            mgp.deal_detail = ['//div[@class="wrap"]','//div[@id="middle"]']
-            mgp.parse_url = href
-            href_list.append(href)
-        #     yield mgp
-        # dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-
-if __name__ == "__main__":
-    Zshcg(redis_key="fwork:{spider_name}").start()
-    # print(timestamp_to_date(1639635843,time_format="%Y-%m-%d %H:%M:%S"))
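
The Sinopec spider above receives publish times as millisecond epochs (info["createdate"]["time"]) and converts them with feapder's timestamp_to_date after dividing by 1000. An equivalent conversion using only the standard library, shown here as a hedged stand-in for that call:

    import time

    def ms_timestamp_to_date(ms, fmt="%Y-%m-%d %H:%M:%S"):
        """Convert a millisecond epoch to a local-time date string."""
        return time.strftime(fmt, time.localtime(ms / 1000))

    print(ms_timestamp_to_date(1639640334801))   # e.g. '2021-12-16 15:38:54' in UTC+8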

+ 0 - 98
spiders/马国鹏/中泰集团招标投标网.py

@@ -1,98 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-17 09:39:39
----------
-@summary: 中泰集团招标投标网
----------
-@author: maguopemng
-"""
-import sys
-
-from requests_toolbelt import MultipartEncoder
-
-
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.clean_html.defaults import cleaner
-
-
-
-class AZtjtzbtbwXxggQb(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "中泰集团招标投标网"
-
-         self.menus = [
-             Menu('信息公告-全部', 'a_ztjtzbtbw_xxgg_qb', "自定义参数", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://scm.zthx.com/srm-pb-web/portalBulletinNoAuth/listByPageNoAuth'
-                 multipart_data = MultipartEncoder(
-                     fields={
-                     "Q_EQ_bidTypeValue": "",
-                     "Q_EQ_noticeTypeValue": "",
-                     "Quick_value": "",
-                     "S_releaseDate": "desc",
-                     "page": str(page),
-                     "rows": "15"
-                 })
-                 headers = {
-                     "Content-Type": multipart_data.content_type
-                 }
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,
-                                       data=multipart_data,headers=headers)
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = f'http://scm.zthx.com/?id={info.get("id")}'
-            title = info.get("title")
-            create_time = info.get("releaseDate")
-            html = info.get("mainBody")
-            result = cleaner(html)
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            data_item.contenthtml = html  # 城市 默认为空
-            data_item.detail = result  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-
-            href_list.append(href)
-            yield data_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    AZtjtzbtbwXxggQb(redis_key="maguopemng:AZtjtzbtbwXxggQb").start()
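
The Zthx spider above posts its query as multipart/form-data: requests_toolbelt's MultipartEncoder builds the body, and its content_type (which carries the generated boundary) has to be copied into the Content-Type header. The same pattern with plain requests, as a minimal sketch; the field values are placeholders:

    import requests
    from requests_toolbelt import MultipartEncoder

    def post_multipart(url, fields):
        """POST string form fields as multipart/form-data and return the response."""
        body = MultipartEncoder(fields=fields)
        headers = {"Content-Type": body.content_type}   # includes the boundary
        return requests.post(url, data=body, headers=headers, timeout=30)

    # post_multipart("http://scm.zthx.com/srm-pb-web/portalBulletinNoAuth/listByPageNoAuth",
    #                {"page": "1", "rows": "15", "S_releaseDate": "desc"})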

+ 0 - 133
spiders/马国鹏/中铁鲁班商务网.py

@@ -1,133 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-20 13:49:04
----------
-@summary: Zglbsww
----------
-@author: dist
-"""
-import json
-import sys
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zglbsww(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'purchaseType',"orders", 'crawl_page'])
-         self.site= "中铁鲁班商务网"
-
-         self.menus = [
-             Menu('公告补遗-招标采购', 'a_ztlbsww_zhbgg', "CRFQ","publish_time", 1),
-             Menu('公告补遗-询价采购', 'a_ztlbsww_ggby_xjcg', "XJFQ","publish_time", 1),
-             Menu('公告补遗-竞争性谈判', 'a_ztlbsww_cqby', "TPFQ","publish_time", 1),
-             Menu('公告补遗-竞价采购', 'a_ztlbsww_ggby_jjcg', "JJFQ","publish_time", 1),
-
-             Menu('采购公告-招标采购', 'a_ztlbsww_zbgg', "CRFQ","pub_time", 1),
-             Menu('采购公告-询价采购', 'a_ztlbsww_lsxjcg', "XJFQ","pub_time", 1),
-             Menu('采购公告-竞争性谈判', 'a_ztlbsww_jzxtp', "TPFQ","pub_time", 1),
-             Menu('采购公告-竞价采购', 'a_ztlbsww_jjcg', "JJFQ","pub_time", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 '''
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 '''
-                 start_url = f'https://eproport.crecgec.com/epu-portal/portal/project/listWithPage'
-                 data = {
-                     "timeType": "month",
-                     "areaCode": "-1",
-                     "mainType": "-1",
-                     "purchaser": None,
-                     "information": None,
-                     "sTime": "",
-                     "eTime": "",
-                     "classify": "-1",
-                     "region": "-1",
-                     "level": "",
-                     "selectedState": "",
-                     "purchaseType": menu.purchaseType,
-                     "noticeType": 1,
-                     "orders": menu.orders,
-                     "dirs": "desc",
-                     "current": page,
-                     "size": 10,
-                     "page": {}
-                 }
-                 data = json.dumps(data)
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,method="POST",data=data)
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            projectid = info.get("projectId")
-            tenantid = info.get("tenantId")
-            href = f'https://eproport.crecgec.com/#/notice/noticexj-detail?projectId={projectid}&tenantId={tenantid}'
-            title = info.get("projectName")
-            create_time = info.get("publishTime") + ":00"
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_ztlbw"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//*']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="****"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-
-            "Content-Type": "application/json"
-        }
-if __name__ == "__main__":
-    Zglbsww(redis_key="dist:Zglbsww").start()

+ 0 - 105
spiders/马国鹏/亿企优采.py

@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-16 09:23:14
----------
-@summary: Yqyc
----------
-@author: maguopemng
-"""
-import sys
-
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem, MgpListItem, ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from requests_toolbelt import MultipartEncoder
-import json
-
-
-class Yqyc(feapder.Spider):
-
-    def start_callback(self):
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-        self.site = "亿企优采"
-
-        self.menus = [
-            Menu('竞价结果列表', 'a_yqyc_jjjglb', "bidResultList", 1),
-            Menu('待竞价列表', 'a_yqyc_djjlb', "biddingList", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1, menu.crawl_page + 1):
-                start_url = f'http://www.vins.com.cn/business/login/{menu.types}'
-                multipart_data = MultipartEncoder(
-                    fields={"page": json.dumps(
-                        {"numPerPage": 10, "pageNum": page, "condition": "LIKE", "keyword": "f01", "searchValue": "",
-                         "orderField": "", "orderDirection": "", "filterParams": {}})})
-                headers = {
-                    "Content-Type": multipart_data.content_type
-                }
-                yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False,
-                                      data=multipart_data, method="POST", headers=headers)
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            if menu.get("types")=="biddingList":
-                href = f'http://www.vins.com.cn/business/bidingDetail?fid={info.get("f14")}&school={info.get("f04")}'
-                title = f'待竞价详细({info.get("f01")})---- {info.get("f05")}'
-                create_time = info.get("f07")
-            else:
-                href = f'http://www.vins.com.cn/business/bidResultDetail?fid={info.get("f14")}&school={info.get("f04")}'
-                title = f'竞价结果详细({info.get("f01")})---- {info.get("f05")}'
-                create_time = info.get("f25")
-
-
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentWrapper"]']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-if __name__ == "__main__":
-    Yqyc(redis_key="maguopemng:Yqyc").start()

+ 0 - 76
spiders/马国鹏/华润置地华东大区网站.py

@@ -1,76 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:45:21
----------
-@summary:华润置地华东大区网站
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Hrzdhddqwz(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Hrzdhddqwz', 'Hrzdhddqwz', "Notice", 1),
-             # Menu('Hrzdhddqwz', 'Hrzdhddqwz', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://sh.crland.com.cn/shanghai1/index.html'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//div[@class='east-tender']//tr[position()>1]")
-        for info in info_list:
-            href = info.xpath('./td[2]/a/@href').extract_first()
-            title = info.xpath('./td[2]/a/text()').extract_first()
-            create_time = info.xpath('./td[4]/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "华润置地华东大区网站"
-            data_item.area = "上海市"  # 城市默认:全国
-            data_item.city = "上海市"  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"  # 虽然用的json方法,但处理的不是json型数据,因为title需要重查
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = '''
-title = response.xpath('//div[@class="east-news-detail-title"]/text()').extract_first()
-html = response.xpath('//div[@class="east-news-detail-bottom"]').extract_first()
-list_item.title = title
-list_item.contenthtml = html
-            '''
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Hrzdhddqwz(redis_key="fwork:Hrzdhddqwz").start()

+ 0 - 120
spiders/马国鹏/南通市如皋市政府采购网上商城.py

@@ -1,120 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-18 13:18:40
----------
-@summary: 	南通市如皋市政府采购网上商城
----------
-@author: maguopemng
-"""
-import sys
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class JsNtsrgszfcgwssc(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "南通市如皋市政府采购网上商城"
-
-         self.menus = [
-             Menu('分散公告', 'js_ntsrgszfcgwssc_fsgg', "自定义参数", 1),
-             # Menu('JsNtsrgszfcgwssc抓取栏目', 'JsNtsrgszfcgwssc爬虫code', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://rugao.ntzfcg.cn/cgr_articles.html?category_id=5&page={page}'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//ul[@class="list_main"]/li')
-        for info in info_list:
-            href = info.xpath('./a/@href').extract_first()
-            title = info.xpath('./a/text()').extract()[-1].strip()
-            create_time = "20" + info.xpath('./a/span/text()').extract_first().strip()
-            area = "江苏"  # 省份
-            city = "南通市"  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="nes_details"]']
-            list_item.proxies = False
-            list_item.ex_python = '''
-js_str="""function randomString(e) {
-    e = e || 32;
-    var t = "ABCDEFGHJKMNPQRSTWXYZabcdefhijkmnprstwxyz2345678"
-      , n = t.length
-      , o = "";
-    for (i = 0; i < e; i++)
-        o += t.charAt(Math.floor(Math.random() * n));
-    return o
-}
-function undefind_function(nowtimes) {
-    for (var e = nowtimes, t = (new Date).getSeconds(), i = 100 * Number(e) + t, n = parseInt(Number(i) / 1e3), o = new Array(4), r = 3; 0 <= r; r--)
-        3 == r ? o[3] = Number(i) % 1e3 : (o[r] = n % 1e3,
-        n = parseInt(n / 1e3));
-    var s = o.map(function(e) {
-        var t, i = [1, 3, 5, 7, 9], n = [0, 2, 4, 6, 8];
-        return e < 480 ? (e = 1e3 - e,
-        t = i[Math.floor(Math.random() * i.length)]) : t = n[Math.floor(Math.random() * n.length)],
-        (randomString(2) + e.toString(16) + t).toUpperCase()
-    }).join("-")
-      , a = parseInt(t / 10)
-      , l = t % 10
-      , c = a * l * 100 + 10 * (a + 1) + (9 - l);
-    return "_new_rugao_session=" + s + "-" + randomString(4) + c
-}"""
-ctx = execjs.compile(js_str)
-cookie = ctx.call("undefind_function",str(int(time.time())))  
-request_params["headers"]={"Cookie":cookie}         
-
-'''
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//span[@class="font16 cgr_ar_content  mb29"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "file_type":'file_name',
-                "files_type":('zip','doxc','ftp','rar','pdf','xlsx','doc','jpg'), # 需要下载的附件类型
-                "url_key":'attachments', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host":'http://rugao.ntzfcg.cn',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    JsNtsrgszfcgwssc(redis_key="maguopemng:JsNtsrgszfcgwssc").start()
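
The ex_python block above is stored on the task as text; the detail runner executes it before building the request, where it uses execjs to evaluate the site's session-cookie JavaScript and injects the result through request_params["headers"]. The execjs call pattern on its own, with a trivial stand-in function (a Node.js runtime must be available to PyExecJS):

    import execjs

    js_src = "function make_session(seed) { return '_new_rugao_session=' + seed; }"  # stand-in for the real JS above
    ctx = execjs.compile(js_src)
    print(ctx.call("make_session", "abc"))   # _new_rugao_session=abc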

+ 0 - 101
spiders/马国鹏/天津市政府采购网.py

@@ -1,101 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 10:04:03
----------
-@summary:天津市政府采购网.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import format_date
-import time
-
-class Tjszf(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-         self.count = 0
-         self.prox_pool = ProxyPool()
-         Menu = namedtuple('Menu', ['channel', 'code', 'id','st', 'crawl_page'])
-
-         self.menus = [
-             Menu('采购公告市级',        'tj_tjszfcgw_cggg_sj', "1665", 1,   1),
-             Menu('采购公告区县',        'tj_tjszfcgw_cggg_qj', "1664",None, 1),
-             Menu('采购结果公告市级',     'tj_tjszfcgw_cgjggg_sj', "2014", 1,   1),
-             Menu('采购结果公告区县',     'tj_tjszfcgw_cgjggg_qx', "2013",None, 1),
-             Menu('采购需求征求意见市级',  'tj_tjszfcgw_cgxqzqyj_sj', "1662",1, 1),
-             Menu('采购需求征求意见区县', 'tj_tjszfcgw_cgxqzqyj_qj', "1994", None, 1),
-             Menu('单一来源公示-市级',    'tj_tjszfcgw_cgxqzqyj_sj', "2033", 1,   1),
-             Menu('单一来源公示-区级',    'tj_tjszfcgw_dylygs_qx', "2034", None, 1),
-             Menu('更正公告市级',        'tj_tjszfcgw_gzgg_sj', "1663", 1, 1),
-             Menu('更正公告区县',        'tj_tjszfcgw_gzgg_qx', "1666", None, 1),
-             Menu('合同验收公告市级',     'tj_tjszfcgw_htysgg_sj', "2015", 1, 1),
-             Menu('合同验收公告区县',     'tj_tjszfcgw_htysgg_qx', "2016", None, 1),
-             Menu('监督检查处理决定公告-市级','tj_tjszfcgw_jdjccjjdgg_sj', "5776730", 1, 1),
-             Menu('监督检查处理决定公告-区级','tj_tjszfcgw_jdjccjjdgg_qj', "5903537", None, 1),
-             Menu('投诉处理决定-市级',     'tj_tjszfcgw_tscljd', "5776729", None, 1),
-             Menu('投诉处理决定公告-区级',  'tj_tjszfcgw_tscljd_qj', "5903425", None, 1),
-             Menu('采购意向公开-市级',  'tj_tjszfcgw_cgyxgk_sj', "2021", 1, 1),
-             Menu('采购意向公开-区级',  'tj_tjszfcgw_cgyxgk_qj', "2022", None, 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            stmp = int(time.time()*1000)
-            start_url = f'http://www.ccgp-tianjin.gov.cn/portal/topicView.do?method=view&view=Infor&id={menu.id}&ver=2{"&st"+str(menu.st) if menu.st else ""}&stmp={stmp}'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        # print(response.text)
-        info_list = response.xpath('//ul[@class="dataList"]/li')
-        menu = request.item
-        self.count += 1   # 一个计数器
-        href_list = []
-        dedup = Dedup(Dedup.BloomFilter, absolute_name="boolm:list")
-        for info in info_list:
-            create_time = info.xpath("./span/text()").extract_first()
-            create_time = format_date(create_time, "%a %b %d %H:%M:%S CST %Y")
-            href = info.xpath("./a/@href").extract_first()
-            title = info.xpath("./a/@title").extract_first()
-            list_item = DataBakItem()  # 存储数据的管道
-            list_item.href = href  # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title  # 标题
-            list_item.publishtime = create_time  # 标书发布时间
-            list_item.site = "天津市政府采购网"
-            list_item.area = "天津市"  # 城市默认:全国
-            list_item.city = "天津市"  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            mgp =  MgpListItem()
-            mgp.parse = "self.detail_get"
-            mgp.parser_name = "details"
-            mgp.item = list_item.to_dict
-            # mgp.author = '马国鹏'
-            mgp.deal_detail = ['//table',"//div[@class='pageInner']"]
-            mgp.parse_url = href
-            href_list.append(href)
-            yield mgp
-        dedup.add(href_list)
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-
-    def download_midware(self, request):
-        request.proxies = self.prox_pool.get()
-        return request
-
-if __name__ == "__main__":
-    Tjszf(redis_key="magp:tjszfcg").start()
-'''
-imageString=67&method=downEnIdFile1&id=301079006&fileId=LwQVvvUfo5A*
-
-'''
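
The Tianjin spider above receives publish times in the form "Mon Dec 13 10:04:03 CST 2021" and normalizes them with feapder's format_date. A standard-library equivalent of that conversion, shown as a hedged sketch (it assumes an English locale for the %a/%b tokens):

    from datetime import datetime

    def normalize_pub_time(raw):
        """Convert 'Mon Dec 13 10:04:03 CST 2021' style strings to 'YYYY-MM-DD HH:MM:SS'."""
        cleaned = raw.replace("CST ", "")     # strptime has no portable handling for the CST token
        return datetime.strptime(cleaned, "%a %b %d %H:%M:%S %Y").strftime("%Y-%m-%d %H:%M:%S")

    print(normalize_pub_time("Mon Dec 13 10:04:03 CST 2021"))   # 2021-12-13 10:04:03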

+ 0 - 137
spiders/马国鹏/广东省政府采购网.py

@@ -1,137 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-18 09:41:49
----------
-@summary: Gdszfcgw
----------
-@author: dist
-"""
-import sys
-from urllib.parse import urljoin
-
-import requests
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder,time
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.get_imgcode import get_code
-#
-# # custom_settings = { 'DOWNLOAD_DELAY': 10, 'CONCURRENT_REQUESTS_PER_IP': 4, 'DOWNLOADER_MIDDLEWARES': {}, }
-# settings = { 'LOG_LEVEL': "INFO" }
-class Gdszfcgw(feapder.Spider):
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'noticetype','notchannel', 'crawl_page'])
-         self.site= "广东省政府采购网"
-         self.host = 'https://gdgpo.czt.gd.gov.cn'
-
-         self.menus = [
-             Menu('采购意向公开', 'gd_gdszfcgwxwz_cgyxgk','59','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('单一来源公示', 'gd_gdszfcgwxwz_cggg_pccgyxgk','001051','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('采购计划', 'gd_gdszfcgwxwz_cgjh', '001101','95ff31f3-a1af-4bc4-b1a2-54c894476193', 1),   #1
-             Menu('采购需求', 'gd_gdszfcgwxwz_cgxq', '001059','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('资格预审公告', 'gd_gdszfcgwxwz_zgysgg', '001052,001053','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #2
-             Menu('采购公告', 'gd_gdszfcgwxwz_cggg', '00101','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('中标成交公告', 'gd_gdszfcgwxwz_zbcjgg', '00102','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('更正公告', 'gd_gdszfcgwxwz_gzgg', '00103','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('终止公告', 'gd_gdszfcgwxwz_zzgg', '001004,001006','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #3
-             Menu('合同公告', 'gd_gdszfcgwxwz_htgg', '001054','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('验收公告', 'gd_gdszfcgwxwz_ysgg', '001009,00105A','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '201022,201023,201111,00107D','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '202022,202023,202111,00107E,001076','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001071','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '204022,204023,204111,204112','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001054', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001009,00105A', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-
-             # Menu('批量采购', 'gd_gdszfcgwxwz_plcg',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/dzmcgg/index.html', 1),
-             # Menu('进口产品清单', 'gd_gdszfcgwxwz_jkcpqd',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/jkcpqd/index.html','','d7284b7e-29e9-4fe4-bad3-b187ec8edbf9' 1),
-         ]
-    def start_requests(self):
-        code = self.get_code()
-        for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do?&siteId=cd64e06a-21a7-4620-aebc-0576bab7e07a&channel={menu.notchannel}&currPage={page}&pageSize=10&noticeType={menu.noticetype}&regionCode=440001&verifyCode={code}&subChannel=false&purchaseManner=&title=&openTenderCode=&purchaser=&agency=&purchaseNature=&operationStartTime=&operationEndTime=&selectTimeName=noticeTime'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-    def get_code(self):
-        img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
-        header = {"Host": "www.ccgp-tianjin.gov.cn",
-                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
-                  "Origin": "http://www.ccgp-tianjin.gov.cn",
-
-                  }
-        res = requests.get(img_url, headers=header)
-        with open('image/guangdong.jpg', 'wb+') as f:
-            f.write(res.content)
-        res = get_code('image/guangdong.jpg')
-        if res.get("msg")=="success":
-            img_code = res.get("r").get("code")
-        else:
-            img_code = None
-        return img_code
-
-
-    def parse(self, request, response):
-        time.sleep(0.3)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data")
-        for info in info_list:
-            href = info.get("pageurl")
-            title = info.get("shorttitle")
-            create_time = info.get("addtimeStr")
-            href = urljoin(self.host, href)
-
-            area = "广东"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="info-article in active"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="info-article in active"]//div/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp','pdf'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gdszfcgw(redis_key="dist:Gdszfcgw").start()
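
get_code() above shows the captcha flow for the Guangdong list API: download the verifyCode image, send it to the in-house OCR helper (untils.get_imgcode.get_code), and splice the recognized text into the list URL as verifyCode=. The fetch-and-recognize step on its own, with an illustrative function name and file path:

    import requests
    from untils.get_imgcode import get_code

    def fetch_verify_code(img_url, path="image/verify.jpg"):
        """Download a captcha image and return the recognized text, or None on failure."""
        resp = requests.get(img_url, timeout=10)
        with open(path, "wb") as f:
            f.write(resp.content)
        result = get_code(path)                  # external OCR service used above
        if result.get("msg") == "success":
            return result.get("r", {}).get("code")
        return None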

+ 0 - 75
spiders/马国鹏/广发证券采购平台.py

@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:12:42
----------
-@summary: 广发证券采购平台
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from urllib.parse import urljoin
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Gfzqcgpt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         self.host = 'https://gfjc.gf.com.cn'
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('公告公示信息', 'a_gfzqcgpt_gggsxx', "gonggao", 1),
-             # Menu('Gfzqcgpt', 'Gfzqcgpt', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                start_url = f'https://gfjc.gf.com.cn/gonggao/index_{page}.jhtml'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//div[@class="list-news-mode"]')
-        for info in info_list:
-            href = urljoin(self.host, info.xpath('./div/a/@href').extract_first())
-            title = info.xpath('./div/a/text()').extract_first()
-            create_time = info.xpath('./div/div/span[3]/text()').extract_first()
-            create_time = create_time.split(":")[-1]
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "广发证券采购平台"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="list-news-box"]']
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gfzqcgpt(redis_key="fwork:Gfzqcgpt").start()

+ 0 - 110
spiders/马国鹏/杭州市公共资源交易.py

@@ -1,110 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-11-26 16:28:18
----------
-@summary: 杭州市公共资源交易
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from collections import namedtuple
-import feapder
-import time
-from feapder.dedup import Dedup
-from items.spider_item import DataBakItem, MgpListItem
-
-
-class Hzsggzy(feapder.Spider):
-    # custom database config; can be removed when the project has its own setting.py
-    def start_callback(self):
-        self.start_url = 'https://ggzy.hzctc.hangzhou.gov.cn/SecondPage/GetNotice'
-        self.count = 0
-
-    def start_requests(self):
-        Menu = namedtuple('Menu', ['channel', 'code', 'afficheType', 'crawl_page'])
-        menus = [
-            # Menu('工程建设-项目合同', 'zj_hzsggzyjyw_gcjs_xmht', "506", 1, ),
-            # Menu('工程建设-招标文件预公示', 'zj_hzsggzyjyw_gcjs_zbwjygs', "505", 2, ),
-            # Menu('工程建设-核准信息公告', 'zj_hzsggzyjyw_gcjs_hzxxgg', "518", 1, ),
-            # Menu('政府采购-更正答疑', 'zj_hzsggzyjy_zfcg_gzdy2', "27", 1, ),
-            Menu('政府采购-采购公告', 'zj_hzsggzyjy_zfcg_cggg2', "29", 2, ),
-            Menu('综合其他-中标结果公告', 'zj_hzsggzyjyw_zhqt_zbjggg', "507", 1, ),
-            Menu('综合其他-中标前公示', 'zj_hzsggzyjyw_zhqt_zbqgs', "37", 1, ),
-            Menu('综合其他-答疑文件', 'zj_hzsggzyjyw_zhqt_dywj', "499",1, ),
-            Menu('综合其他-答疑公告', 'zj_hzsggzyjyw_zhqt_dygg', "469", 1, ),
-            Menu('综合其他-招标公告', 'zj_hzsggzyjyw_zhqt_zbgg', "34", 1, ),
-
-            Menu('工程建设-招标公告', 'zj_hzsggzyjy_gcjs_zbgg', "22", 1, ),
-            Menu('工程建设-答疑文件', 'zj_hzsggzyjy_gcjs_dywj', "23", 1, ),
-            Menu('工程建设-答疑公告', 'zj_hzsggzyjy_gcjs_dygg', "465", 1, ),
-            Menu('工程建设-开标结果公示', 'zj_hzsggzyjy_gcjs_kbjggs', "486", 1, ),
-            Menu('工程建设-中标前公示', 'zj_hzsggzyjy_gcjs_zhbqgs', "25", 1, ),
-            Menu('工程建设-中标公告', 'zj_hzsggzyjy_gcjs_zbgs', "28", 1, ),
-
-            Menu('政府采购-意见征询', 'zj_hzsggzyjy_zfcg_yjzx', "26", 1, ),
-            Menu('政府采购-答疑公告', 'zj_hzsggzyjy_zfcg_dygg', "466", 1, ),
-            Menu('政府采购-结果公告', 'zj_hzsggzyjy_zfcg_jggg', "32", 1, ),
-
-        ]
-        for menu in menus:
-            for page in range(1,menu.crawl_page+1):
-
-                data = {
-                    "area":"",
-                    "afficheType":menu.afficheType,
-                    "IsToday":"",
-                    "title":"",
-                    "proID":"",
-                    "number":"",
-                    "_search":"false",
-                    "nd":int(time.time()*1000),
-                    "rows":"10",
-                    "page":page,
-                    "sidx":"PublishStartTime",
-                    "sord":"desc"
-                }
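-                # the payload above looks like a jqGrid-style paging request (an assumption from the parameter names):
-                # "rows" is the page size, "sidx"/"sord" sort by PublishStartTime desc, "nd" is a millisecond cache-buster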
-
-                yield feapder.Request(url=self.start_url,data=data,method="POST",item=menu._asdict(),verify=False,proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        href_list = []
-        dedup = Dedup(Dedup.BloomFilter)
-        info_list =response.json.get("rows")
-        for info in info_list:
-            info_id = info.get("ID")
-            tenderno = info.get("TenderNo")
-            title = info.get("TenderName")
-            create_time = info.get("PublishStartTime")
-            inner = info.get("IsInner")
-            href = f'https://ggzy.hzctc.hangzhou.gov.cn/AfficheShow/Home?AfficheID={info_id}&IsInner={inner}&ModuleID={menu.get("afficheType")}'
-            data_item = DataBakItem()
-            data_item.href = href
-            data_item.title = title
-            data_item.publishtime = create_time
-            data_item.channel = menu.get("channel")
-            data_item.spidercode = menu.get("code")
-            data_item.site = "杭州市公共资源交易"
-            data_item.area = "浙江"
-            data_item.city = "杭州市"
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            # list_item.__table_name__ = 'mgp_list'
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ["//div[@class='content']"]
-            # list_item.create_time = '//div[@class="article-author"]/text()[-1]'
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Hzsggzy(redis_key="mgp:hzsggzy",debug=True).start()

+ 0 - 99
spiders/马国鹏/武汉市公共资源交易平台.py

@@ -1,99 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-29 10:06:02
----------
-@summary:  武汉市公共资源交易平台
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.cookie_pool import PageCookiePool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Whsggzyjypt(feapder.Spider):
-
-    cookie_pool = PageCookiePool(redis_key='fwork:Whsggzyjypt',page_url='https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do')
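-    # assumption: PageCookiePool warms cookies by visiting page_url and caches them in redis
-    # under 'fwork:Whsggzyjypt'; download_midware() below pulls one cookie per request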
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('资格预审公示', 'hb_whsggzyjypt_zgysgs', "Notice", 3), # roughly 300 pages of historical data
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             start_url = f'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoList.do'
-             for page in range(1,menu.crawl_page+1):
-                 data = {
-                     "page": page,
-                     "rows": "10"
-                 }
-                 yield feapder.Request(url=start_url, data=data, method="POST", item=menu._asdict())
-
-    def parse(self, request, response):
-        if '当前操作存在安全风险' in response.text:
-            self.cookie_pool.del_cookie(request.cookies)
-            yield request
-            return  # re-queue with a fresh cookie and stop parsing the blocked response
-        menu = request.item
-        self.count += 1   # simple request counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = f'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoDetail.do?id={info.get("id")}'
-            title = info.get("prjName")
-            create_time = info.get("insertDate")
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = "武汉市公共资源交易平台"
-            data_item.area = "湖北省"  # province
-            data_item.city = "武汉市"  # city
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_cookie"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="pageRight_box"]']
-            list_item.parse_url = href
-            list_item.down_mid = {"key":'fwork:Whsggzyjypt',"text":"当前操作存在安全风险","code":(404,500),
-                                  "page_url":'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do'}
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.headers = {
-            "Sec-Fetch-Mode": "cors",
-            "Sec-Fetch-Site": "same-origin",
-            "Origin": "https://www.whzbtb.com",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36 Core/1.77.81.400 QQBrowser/10.9.4608.400",
-            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
-            "Accept": "application/json, text/javascript, */*; q=0.01",
-            "Referer": "https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do",
-            "X-Requested-With": "XMLHttpRequest",
-            "Connection": "keep-alive"
-        }
-
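-        # attach a cookie from the shared pool; parse() calls del_cookie() on it when the site
-        # answers with the "当前操作存在安全风险" block page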
-        request.cookies = self.cookie_pool.get_cookie()
-        return request
-
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Whsggzyjypt(redis_key="fwork:Whsggzyjypt").start()

+ 0 - 132
spiders/马国鹏/湖北省政府采购网.py

@@ -1,132 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-16 11:31:01
----------
-@summary: HbHbszfcgwCgyxgg
----------
-@author: maguopemng
-"""
-import sys
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.cookie_pool import PageCookiePool
-
-
-class HbHbszfcgwCgyxgg(feapder.Spider):
-    cookiepool = PageCookiePool(redis_key='fwork:gszfcg',
-                            page_url='http://www.ccgp-hubei.gov.cn:9040/quSer/initSearch')
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "湖北省政府采购网"
-         self.menus = [
-             Menu('采购意向公告', 'hb_hbszfcgw_cgyxgg', "自定义参数", 1),
-         ]
-
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://www.ccgp-hubei.gov.cn:9040/quSer/search'
-                 data = {
-                     "queryInfo.type": "cgyx",
-                     "queryInfo.key": "",
-                     "queryInfo.xmmc": "",
-                     "queryInfo.cgdw": "",
-                     "queryInfo.city": "湖北省",
-                     "queryInfo.qybm": "42????",
-                     "queryInfo.district": "全省",
-                     "queryInfo.je1": "",
-                     "queryInfo.begin": "",
-                     "queryInfo.end": "",
-                     "queryInfo.pageNo": "3",
-                     "queryInfo.pageSize": "15",
-                     "queryInfo.pageTotle": "2950"
-                 }
-                 headers = {
-                     "Content-Type": "application/x-www-form-urlencoded",
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,data=data,method="POST",headers=headers)
-
-    def parse(self, request, response):
-        if '查询失败,请重新查询' in response.text:
-            self.cookiepool.del_cookie(request.cookies)
-            yield request
-            return  # re-queue with a fresh cookie and stop parsing the failed response
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//tbody/tr')
-        for info in info_list:
-            href = info.xpath('./td[last()]/a/@href').extract_first()
-            title = info.xpath('./td[2]/text()').extract_first()
-            create_time = info.xpath('./td[5]/text()').extract_first()
-            area = "湖北"  # 省份
-            city = ""  # 城市
-            print(title,create_time,href)
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.proxies = False
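-            # deal_detail below is a code snippet rather than an xpath list; it is presumably exec'd by the
-            # shared "details" parser to build contenthtml and download the base64-id attachments via AttachmentDownloader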
-            list_item.deal_detail = '''
-html = response.xpath('//div[@style="margin: 0 22px;"]').extract_first()   
-list_item.contenthtml=html
-files =  response.xpath('//ul[@class="list-unstyled details-ul"]/li')
-if len(files) > 0:
-    attachments = {}
-    for index, info in enumerate(files):
-        file_id = info.xpath('./a/@href').extract_first().strip("javascript:downloadFile();Base64").strip("'")
-        file_name = info.xpath('./a/@download').extract_first()
-        import base64
-        file_url = 'http://www.ccgp-hubei.gov.cn:8090/gpmispub/download?id=' + base64.b64encode(file_id.encode('utf-8')).decode()
-        file_type = file_name.split(".")[-1].lower()
-        file_name = file_name.split(".")[0]
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type=file_type, download_url=file_url,
-            enable_proxy=False)
-        attachments[str(len(attachments) + 1)] = attachment
-    if attachments:
-        list_item.projectinfo = {"attachments": attachments}
-            '''
-            list_item.parse_url = href
-            list_item.pri = 1
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.cookies = self.cookiepool.get_cookie()
-        return request
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    HbHbszfcgwCgyxgg(redis_key="maguopemng:HbHbszfcgwCgyxgg").start()

+ 0 - 113
spiders/马国鹏/滁州市人民政府网.py

@@ -1,113 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-14 20:02:21
----------
-@summary: 滁州市人民政府网
----------
-@author: mgp
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Czsrmzf(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "滁州市人民政府网"
-
-         self.menus = [
-             Menu('政府信息公开目录-公立医疗机构药品医用设备采购', 'ah_czsrmzfw_gcztb_zbgg', "自定义参数", 1),
-             Menu('重大建设项目-招标投标信息', 'ah_czsrmzfw_zfcg_cggg', "自定义参数", 1),
-             Menu('政府采购', 'ah_czsrmzfw_gcztb_zbgs', "Notice", 1),
-             Menu('工程建设招投标', 'ah_czsrmzfw_zfcg_zbcjgg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.chuzhou.gov.cn/chuzhou/site/label/8888'
-                 params = {
-                        "IsAjax": "1",
-                        "dataType": "html",
-                        "_": "0.5840033326645138",
-                        "labelName": "publicInfoList",
-                        "siteId": "2653861",
-                        "pageSize": "20",
-                        "pageIndex": "3",
-                        "action": "list",
-                        "isDate": "true",
-                        "dateFormat": "yyyy-MM-dd",
-                        "length": "50",
-                        "organId": "2681509",
-                        "type": "4",
-                        "catId": "161735369",
-                        "cId": "",
-                        "result": "暂无相关信息",
-                        "title": "",
-                        "fileNum": "",
-                        "keyWords": "",
-                        "file": "/c1/chuzhou/publicInfoList_newest"
-                    }
-                 yield feapder.Request(url=start_url, params=params, item=menu._asdict(), proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul")
-        for info in info_list:
-            href = info.xpath("./li/a/@href").extract_first().strip()
-            title = info.xpath("./li/a/@title").extract_first().strip()
-            create_time = info.xpath("./li/span/text()").extract_first().strip()
-            area = "安徽"  # 省份
-            city = "滁州市"  # 城市
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentbox minh500"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//a[contains(@data-file-ext,"D")]',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host": 'https://www.chuzhou.gov.cn'
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Czsrmzf(redis_key="magp:Czsrmzf").start()

+ 0 - 92
spiders/马国鹏/玖隆在线_交易公告.py

@@ -1,92 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-15 14:01:43
----------
-@summary: Jlzx
----------
-@author: maguopemng
-"""
-import sys
-
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from untils.clean_html.defaults import cleaner
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-class AJlzxJygg(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "玖隆在线"
-
-         self.menus = [
-             Menu('交易公告', 'a_jlzx_jygg', "自定义参数", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = "http://www.e9656.com/portaletm-2.0.0//dataViewAjax!show.htm"
-                 params = {
-                     "callback": "",
-                     "ajaxParam.esbService": "afficheService.queryAfficheAll",
-                     "ajaxParam.esbParam": "%5B%7B%22cmemberCode%22%3A%22S000016%22%2C%22queryOrderStr1%22%3A%22afficheDate%20desc%22%7D%5D",
-                     "paging.limit": "12",
-                     "paging.start": "0",
-                     "ajaxParam.retClass": "com.soft.bc.oamsg.affiche.vo.QueryAffiche",
-                     "ajaxParam.esbParamClass": "[\"com.soft.bc.oamsg.affiche.vo.QueryBean\"]",
-                     "ajaxParam.esbParamName": "[\"queryBean\"]",
-                     "ajaxParam.resultParamName": "data",
-                     "ajaxParam.callbackParam": "{\"maskPlace\":\"$(\\\"div[name='doclist'][id='jygg'],span[name='doclist'][id='jygg']\\\")\"}"
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,params=params)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("data").get("list")
-        for info in info_list:
-            href = f'https://www.e9656.com/trade//auctionHallAction!getOaAffiche.htm?glistTempbatch={info.get("afficheExbillno")}'
-            title = info.get("afficheTitle")
-            create_time = info.get("afficheEdate")
-            html = info.get("afficheContent")
-            result = cleaner(html)
-
-            area = "江苏"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city, empty by default
-            data_item.contenthtml = html  # raw announcement html from the list api
-            data_item.detail = result  # cleaned plain-text content
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            href_list.append(href)  # record the href so rel_count and dedup.add() below see it
-            yield data_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    AJlzxJygg(redis_key="maguopemng:AJlzxJygg").start()

Some files are not shown because too many files changed in this diff