maguopeng, 2 years ago
Parent
Current commit
c0b848b691
100 files changed, with 1,627 insertions and 9,464 deletions
  1. + 0 - 47  Crawlb/docker-compose.yml
  2. + 0 - 54  Crawlb/docker-compose_work.yml
  3. + 0 - 170  Details/detail_cookie.py
  4. + 0 - 117  Details/detail_firefox.py
  5. + 0 - 164  Details/details.py
  6. + 0 - 200  Details/details_webcookie.py
  7. + 0 - 15  FworkSpider/details/__init__.py
  8. + 0 - 134  FworkSpider/details/detail_ztlbw.py
  9. + 0 - 170  FworkSpider/details/details.py
  10. + 0 - 165  FworkSpider/details/details_cookie.py
  11. + 0 - 115  FworkSpider/details/details_firefox.py
  12. + 0 - 150  FworkSpider/details/details_login.py
  13. + 0 - 88  FworkSpider/details/dtcookie_pool.py
  14. + 0 - 1  FworkSpider/details/file/sj.js
  15. + 29 - 22  FworkSpider/feapder/buffer/item_buffer.py
  16. + 3 - 3  FworkSpider/feapder/buffer/request_buffer.py
  17. + 1 - 1  FworkSpider/feapder/commands/create_builder.py
  18. + 33 - 0  FworkSpider/feapder/core/base_parser.py
  19. + 6 - 5  FworkSpider/feapder/core/collector.py
  20. + 40 - 25  FworkSpider/feapder/core/parser_control.py
  21. + 62 - 45  FworkSpider/feapder/core/scheduler.py
  22. + 76 - 116  FworkSpider/feapder/dedup/__init__.py
  23. + 178 - 0  FworkSpider/feapder/dedup/old__init__.py
  24. + 1 - 3  FworkSpider/feapder/network/cookie_pool.py
  25. + 1 - 1  FworkSpider/feapder/network/proxy_pool.py
  26. + 83 - 27  FworkSpider/feapder/network/request.py
  27. + 513 - 0  FworkSpider/feapder/network/request6.29.py
  28. + 31 - 13  FworkSpider/feapder/templates/spider_list_template.tmpl
  29. + 1 - 1  FworkSpider/feapder/templates/spider_template.tmpl
  30. + 37 - 17  FworkSpider/feapder/utils/aliyun.py
  31. + 1 - 1  FworkSpider/feapder/utils/email_sender.py
  32. + 41 - 47  FworkSpider/feapder/utils/log.py
  33. + 1 - 1  FworkSpider/feapder/utils/redis_lock.py
  34. + 2 - 5  FworkSpider/feapder/utils/tools.py
  35. + 3 - 8  FworkSpider/feapder/utils/webdriver.py
  36. + 4 - 4  FworkSpider/mongo_pipeline.py
  37. + 98 - 0  FworkSpider/mongo_pipeline_old.py
  38. + 39 - 10  FworkSpider/setting.py
  39. + 141 - 95  FworkSpider/untils/attachment.py
  40. + 136 - 0  FworkSpider/untils/cleaner.py
  41. + 3 - 2  FworkSpider/untils/cookie_pool.py
  42. + 1 - 1  FworkSpider/untils/create_menus.py
  43. + 7 - 0  FworkSpider/untils/get_imgcode.py
  44. + 55 - 89  FworkSpider/untils/tools.py
  45. + 0 - 23  NoteWork/cesspider/__init__.py
  46. + 0 - 247  NoteWork/cesspider/cesspider
  47. + 0 - 6  NoteWork/cesspider/hubeijianzhu.py
  48. + 0 - 50  NoteWork/cesspider/jiangxistouces.py
  49. + 0 - 80  NoteWork/cesspider/js/rsa/Barrett.js
  50. + 0 - 614  NoteWork/cesspider/js/rsa/BigInt.js
  51. + 0 - 583  NoteWork/cesspider/js/rsa/RSA.js
  52. + 0 - 0  NoteWork/cesspider/js/rsa/__init__.py
  53. + 0 - 109  NoteWork/cesspider/magpces.py
  54. + 0 - 95  NoteWork/cesspider/中国南方电网电子采购交易平台.py
  55. + 0 - 133  NoteWork/cesspider/中国鲁班商务委.py
  56. + 0 - 76  NoteWork/cesspider/交通银行供应商门户.py
  57. + 0 - 91  NoteWork/cesspider/华创迅采电子采购平台.py
  58. + 0 - 70  NoteWork/cesspider/国家税务总局宁波市税务局.py
  59. + 0 - 80  NoteWork/cesspider/城轨采购网.py
  60. + 0 - 74  NoteWork/cesspider/山西省招标投标协会.py
  61. + 0 - 32  NoteWork/cesspider/广东测试.py
  62. + 0 - 137  NoteWork/cesspider/广东省政府采购网.py
  63. + 0 - 9  NoteWork/cesspider/测试查询.py
  64. + 0 - 114  NoteWork/cesspider/滁州市人民政府网.py
  65. + 0 - 197  NoteWork/cesspider/甘肃政府采购网.py
  66. + 0 - 213  NoteWork/cesspider/甘肃政府采购网_ces.py
  67. + 0 - 194  NoteWork/cesspider/甘肃政府采购网_new.py
  68. + 0 - 106  NoteWork/cesspider/福建省政府采购网.py
  69. + 0 - 24  NoteWork/cesspider/黔云招采电子招标采购交易平台
  70. + 0 - 93  NoteWork/cesspider/黔云招采电子招标采购交易平台.py
  71. + 0 - 15  NoteWork/details/__init__.py
  72. + 0 - 194  NoteWork/details/detail_dtcookie.py
  73. + 0 - 134  NoteWork/details/detail_ztlbw.py
  74. + 0 - 1082  NoteWork/details/details
  75. + 0 - 170  NoteWork/details/details.py
  76. + 0 - 180  NoteWork/details/details_ces.py
  77. + 0 - 165  NoteWork/details/details_cookie.py
  78. + 0 - 115  NoteWork/details/details_firefox.py
  79. + 0 - 150  NoteWork/details/details_login.py
  80. + 0 - 88  NoteWork/details/dtcookie_pool.py
  81. + 0 - 1  NoteWork/details/file/sj.js
  82. + 0 - 34  NoteWork/details/迁移.py
  83. + 0 - 0  spiders/__init__.py
  84. + 0 - 0  spiders/李宗泽/__init__.py
  85. + 0 - 0  spiders/马国鹏/__init__.py
  86. + 0 - 88  spiders/马国鹏/中国南方航空采购招标网.py
  87. + 0 - 75  spiders/马国鹏/中国石化物质采购电子商务平台.py
  88. + 0 - 98  spiders/马国鹏/中泰集团招标投标网.py
  89. + 0 - 133  spiders/马国鹏/中铁鲁班商务网.py
  90. + 0 - 105  spiders/马国鹏/亿企优采.py
  91. + 0 - 76  spiders/马国鹏/华润置地华东大区网站.py
  92. + 0 - 120  spiders/马国鹏/南通市如皋市政府采购网上商城.py
  93. + 0 - 101  spiders/马国鹏/天津市政府采购网.py
  94. + 0 - 137  spiders/马国鹏/广东省政府采购网.py
  95. + 0 - 75  spiders/马国鹏/广发证券采购平台.py
  96. + 0 - 110  spiders/马国鹏/杭州市公共资源交易.py
  97. + 0 - 99  spiders/马国鹏/武汉市公共资源交易平台.py
  98. + 0 - 132  spiders/马国鹏/湖北省政府采购网.py
  99. + 0 - 113  spiders/马国鹏/滁州市人民政府网.py
  100. + 0 - 92  spiders/马国鹏/玖隆在线_交易公告.py

+ 0 - 47
Crawlb/docker-compose.yml

@@ -1,47 +0,0 @@
-version: '3.3'
-services:
-  master:
-    image: swordfish:v1
-    container_name: master_new
-    environment:
-       CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-       CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-       CRAWLAB_SERVER_MASTER: "Y"
-       CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-       CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-       CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-       CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-       CRAWLAB_REDIS_ADDRESS: "redis"  #
-#       CRAWLAB_REDIS_ADDRESS: "172.19.0.2"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-       CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-       CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-       CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-       CRAWLAB_SERVER_REGISTER_TYPE: "mac"
-    volumes: # 目录挂载,宿主机在前,容器在后
-      - /mnt/magp:/magp
-    ports:
-        - "8998:8080"
-
-
-#    depends_on:
-#          - redis
-
-#    deploy:
-#      resources:
-#        limits:
-#          memory: 15G
-#        reservations:
-#          memory: 1G
-
-#  mongo:
-#    image: mongo:latest
-#    restart: always
-#    ports:
-#      - "27027:27017"
-#  redis:
-#    image: redis:latest
-#    container_name: master_redis
-#    restart: always
-#    ports:
-#      - "6379:6379"
-#  wget http://download.firefox.com.cn/releases/firefox/78.14/zh-CN/Firefox-latest-x86_64.tar.bz2

+ 0 - 54
Crawlb/docker-compose_work.yml

@@ -1,54 +0,0 @@
-version: '3.3'
-services:
-  worker01:
-    image: swordfish:v1
-    container_name: crawlab_worker01
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
-
-  worker02:
-    image: swordfish:v1
-    container_name: crawlab_worker02
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
-  worker03:
-    image: swordfish:v1
-    container_name: crawlab_worker03
-    environment:
-      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
-      CRAWLAB_SERVER_MASTER: "N"
-      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
-      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
-      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
-      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
-      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
-      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
-      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
-      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
-      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
-      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"

+ 0 - 170
Details/detail_cookie.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details:cookie").start()
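
The deleted detail_cookie.py above pairs a download middleware that attaches a pooled cookie with parse steps that discard that cookie and re-queue the request whenever the configured failure-marker text appears in the body, or the status code falls in the codes listed under down_mid["code"]. A minimal standalone sketch of that flow (PageCookiePool.get_cookie()/del_cookie() are used exactly as in the code above; the helper names are illustrative, not part of the original):

from untils.cookie_pool import PageCookiePool

def attach_cookie(request, down_mid):
    # download-middleware step: pick a pooled cookie for this site
    pool = PageCookiePool(redis_key=down_mid["key"], page_url=down_mid["page_url"], selenium=False)
    request.cookies = pool.get_cookie()
    return request

def should_retry(response, down_mid):
    # re-queue when the failure marker text shows up, or when the status code
    # is one of the codes the task lists under "code"
    bad_text = down_mid.get("text")
    if bad_text and bad_text in response.text:
        return True
    return response.status_code in (down_mid.get("code") or [])

def drop_bad_cookie(request, down_mid):
    # remove the cookie that produced the failure so the pool regenerates one;
    # the spider then re-queues the request (in feapder: `yield request`)
    pool = PageCookiePool(redis_key=down_mid["key"], page_url=down_mid["page_url"], selenium=False)
    pool.del_cookie(request.cookies)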

+ 0 - 117
Details/detail_firefox.py

@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-        	code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()

+ 0 - 164
Details/details.py

@@ -1,164 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","failed":0},sort={"failed":1},limit=100)
-            for item in data_lsit:
-                print(item.get("item"))
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    elif files_info.get("file_type")=='file_name':
-                        file_type = file_name.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
-                    pass
-                else:
-                    list_item.projectinfo={"attachments":attachments}
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-        	code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
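
The attachment handling in the deleted Details/details.py (and in details_webcookie.py below) is a single loop: select the file nodes with list_xpath, resolve each URL and name, infer file_type from the URL, from the file name, or from a fixed value, keep only entries whose type is in files_type and whose URL contains url_key, and hand those to AttachmentDownloader. A condensed sketch of that loop, assuming the same files_info keys used above:

from urllib.parse import urljoin
from untils.attachment import AttachmentDownloader

def collect_attachments(response, files_info):
    # returns {"1": attachment, "2": attachment, ...}, the shape stored in projectinfo["attachments"]
    attachments = {}
    for node in response.xpath(files_info["list_xpath"]):
        file_url = node.xpath(files_info["url_xpath"]).extract_first()
        file_name = node.xpath(files_info["name_xpath"]).extract_first()
        if files_info.get("host"):
            file_url = urljoin(files_info["host"], file_url)
        if not files_info.get("file_type"):
            file_type = file_url.split("?")[0].split(".")[-1].lower()   # infer from the URL
        elif files_info["file_type"] == "file_name":
            file_type = file_name.split("?")[0].split(".")[-1].lower()  # infer from the file name
        else:
            file_type = files_info["file_type"]                         # fixed type from the task
        if file_type in files_info["files_type"] and files_info["url_key"] in file_url:
            attachments[str(len(attachments) + 1)] = AttachmentDownloader().fetch_attachment(
                file_name=file_name, file_type=file_type,
                download_url=file_url, enable_proxy=False)
    return attachments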

+ 0 - 200
Details/details_webcookie.py

@@ -1,200 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-from urllib.parse import urljoin
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-from untils.WebCookiePool import WebCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_webcookie"},sort={"date":-1},limit=100)
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        elif response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
-                    pass
-                else:
-                    list_item.projectinfo={"attachments":attachments}
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            cookie_key = down_mid.get("cookie_key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        elif response.status_code in request.down_mid.get("code"):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_key = down_mid.get("cookie_key")
-            cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        else:
-            items = request.item
-            list_item = DataBakItem()
-            for key in items:
-                list_item.__setitem__(key,items[key])
-            exec(request.deal_detail)
-            yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_key = down_mid.get("cookie_key")
-        request.headers={"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
-        cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details_webcookie").start()
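
Every spider removed in this commit carries the same failed_request branching: it buckets the failed task by status code before writing it back to mgp_list with an incremented failed counter (the err_dic literal in the originals is built but never read). The same classification written as one helper, with the duplicate 3xx/4xx branches merged:

def classify_failure(response):
    # 0 means no response at all (timeout); otherwise bucket by status code
    code = 0 if response is None else response.status_code
    if 200 <= code < 300:
        err = "analysis"   # page came back but parsing failed
    elif 300 <= code < 500:
        err = "download"   # redirect or client error, counted as a download problem
    elif code >= 500:
        err = "servers"
    else:
        err = "timeout"
    return code, err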

+ 0 - 15
FworkSpider/details/__init__.py

@@ -1,15 +0,0 @@
-import requests
-
-
-headers = {
-
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-cookies = {
-    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
-}
-url = "http://www.snszgh.gov.cn/gsgg/index.html"
-response = requests.get(url, headers=headers,verify=False)
-
-print(response.text)
-print(response)

+ 0 - 134
FworkSpider/details/detail_ztlbw.py

@@ -1,134 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.log import Log
-from feapder.utils.tools import wechat_warning
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from login_pool.zglbw import ZglbwPool
-from untils.attachment import AttachmentDownloader
-
-Log().info("")
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
-                                        sort={"date": -1}, limit=1)
-            print(data_lsit)
-            for item in data_lsit:
-                url = item.get("parse_url")
-                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
-                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
-                cookie = cookie.get_cookie().cookie
-                yield feapder.Request(url=url, item=item.get("item"),
-                                      callback=self.detail_get, base_info=item, render=True,
-                                      render_time=3, proxies=False, cookies=cookie)
-                self.to_db.delete(self.db_name, item)
-            break
-
-    def detail_get(self, request, response):
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key, items[key])
-        html = ''
-        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
-                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
-                      '//div[@class="login ng-star-inserted"]']
-        for xpath in xpath_list:
-            # import pdb
-            # pdb.set_trace()
-            html_one = response.xpath(xpath).extract_first()
-            if html_one is not None:
-                html += '\n'  # 标书详细内容
-                html += html_one  # 拼接html
-        print(html)
-        list_item.contenthtml = html
-        files_list = response.xpath("//iframe/@src").extract_first()
-        file_url = files_list.split("file=")[-1]
-        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
-        attachments = {}
-        file_name = list_item.title
-
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type='pdf', download_url=file_url,
-            enable_proxy=False)
-        attachments["0"] = attachment
-        list_item.projectinfo = {"attachments": attachments}
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:ztlbw").start()
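
detail_ztlbw.py above recovers the PDF link from the viewer iframe by splitting the src on "file=" and undoing the percent-encoding with chained replace() calls (%3A, %2F, %3F, %3D). A sketch of that same step using the standard library instead of hand-written replacements; this is an equivalent, not the original code:

from urllib.parse import unquote

def pdf_url_from_iframe(iframe_src: str) -> str:
    # ".../viewer?file=https%3A%2F%2Fhost%2Fdoc.pdf" -> "https://host/doc.pdf"
    return unquote(iframe_src.split("file=")[-1])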

+ 0 - 170
FworkSpider/details/details.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
-            for item in data_lsit:
-                print(11111)
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files:
-            files_info = request.files
-            files =  response.xpath(files_info.get("list_xpath"))
-            if request.files_info:
-                files_info = request.files_info
-                files = response.xpath(files_info.get("list_xpath"))
-                if request.files_info:
-                    files_info = request.files_info
-                    files = response.xpath(files_info.get("list_xpath"))
-                    if len(files) > 0:
-                        attachments = {}
-                        for index, info in enumerate(files):
-                            file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                            file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                            if files_info.get("host"):
-                                file_url = urljoin(files_info.get("host"), file_url)
-                            if not files_info.get("file_type"):
-                                file_type = file_url.split("?")[0].split(".")[-1].lower()
-                            else:
-                                file_type = files_info.get("file_type")
-                            if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                                attachment = AttachmentDownloader().fetch_attachment(
-                                    file_name=file_name, file_type=file_type, download_url=file_url,
-                                    enable_proxy=False)
-                                attachments[len(attachments) + 1] = attachment
-                        if len(attachments) == 0:
-                            pass
-                        else:
-                            list_item.projectinfo = {"attachment": attachments}
-
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
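
Apart from their parse callbacks, these detail spiders differ mainly in the mgp_list query issued by start_requests: each pulls pending task documents for its parser_name, rebuilds a feapder.Request from the stored fields (parse_url, item, deal_detail, request_params, with the stored callback name resolved via eval), and deletes the task document once it is queued. A stripped-down skeleton of that dispatch loop, with the query filter as the only per-spider knob (class and attribute names here are illustrative):

import feapder
from feapder.db.mongodb import MongoDB

class DetailTaskSpider(feapder.Spider):
    db_name = "mgp_list"
    parser_name = "details"          # per-spider filter value
    _to_db = None

    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        tasks = self.to_db.find(self.db_name, {"parser_name": self.parser_name},
                                sort={"date": -1}, limit=100)
        for task in tasks:
            request_params = task.get("request_params") or {}
            yield feapder.Request(url=task["parse_url"], item=task["item"],
                                  deal_detail=task["deal_detail"],
                                  callback=eval(task["parse"]),   # e.g. "self.detail_get" -> bound method
                                  base_info=task, **request_params)
            self.to_db.delete(self.db_name, task)                 # task leaves the queue once dispatched

    def detail_get(self, request, response):
        # parse callback referenced by the stored task; the originals build a DataBakItem here
        pass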

+ 0 - 165
FworkSpider/details/details_cookie.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 115
FworkSpider/details/details_firefox.py

@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()

+ 0 - 150
FworkSpider/details/details_login.py

@@ -1,150 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import LoginCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-                print(down_mid)
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.get("down_mid"))
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        mgp = MgpListItem()
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        print(f'......{mgp.failed}')
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
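
The spider above wires a shared cookie pool into download_midware and drops the current cookie whenever the response looks like a rejected session. A condensed sketch of that pattern, reusing only the PageCookiePool interface (get_cookie / del_cookie) already referenced in this file; the down_mid fields mirror the ones read above:

    from untils.cookie_pool import PageCookiePool

    def download_midware(request):
        """Attach a pooled cookie before the request is sent."""
        pool = PageCookiePool(redis_key=request.down_mid["key"],
                              page_url=request.down_mid["page_url"],
                              selenium=False)
        request.cookies = pool.get_cookie()
        return request

    def cookie_is_dead(request, response):
        """True when the blocked-text marker appears or the status code is in down_mid['code'];
        the caller should then del_cookie(request.cookies) and retry the request."""
        marker = request.down_mid.get("text")
        return (bool(marker) and marker in response.text) or \
               response.code in request.down_mid.get("code", ())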

+ 0 - 88
FworkSpider/details/dtcookie_pool.py

@@ -1,88 +0,0 @@
-import json
-import re
-import sys
-
-import execjs
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import PageCookiePool
-import requests
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self,redis_key,header,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
-        super(DTCookiePool, self).__init__(redis_key,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
-        self.headers=header
-        self.page_url = page_url
-
-    def create_cookie(self,):
-        session = requests.Session()
-        start_url = self.page_url
-        print(self.headers)
-        res = session.get(start_url, headers=self.headers,verify=False)
-        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
-        js_func = 'function sd() { return ' + js_func + "}"
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        ss = ctx.call("sd")
-        cookies = {}
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-        res = session.get(start_url, cookies=cookies, headers=self.headers)
-        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
-        js_func = re.sub("<(/*?)script>", "", res.text)
-        location = re.compile('location(.*?)}else')
-        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
-        setTimeout2 = re.compile('setTimeout(.*?)document')
-        gox = re.compile('};go(.*?)\)')
-        js_func = re.sub(location, "}else", js_func)
-        js_func = re.sub(setTimeout, "       document", js_func)
-        js_func = re.sub(setTimeout2, "       document", js_func)
-        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
-        js_func = '''const jsdom = require("jsdom");
-        const {JSDOM} = jsdom;
-        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
-        window = dom.window;
-        document = window.document;''' + js_func
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        with open('ex_js.js', 'w+', encoding='utf-8') as f:
-            f.write(js_func)
-        print(js_do_data)
-        ss = ctx.call("go", json.loads(js_do_data))
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-        res = session.get(start_url, headers=self.headers, cookies=cookies)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        return cookies
-
-if __name__ == '__main__':
-    headers = {
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Accept-Encoding": "gzip, deflate, br",
-    "Accept-Language": "zh-CN,zh;q=0.9",
-    "Cache-Control": "max-age=0",
-    "Connection": "keep-alive",
-    "Host": "www.hefei.gov.cn",
-    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"Windows\"",
-    "Sec-Fetch-Dest": "document",
-    "Sec-Fetch-Mode": "navigate",
-    "Sec-Fetch-Site": "none",
-    "Sec-Fetch-User": "?1",
-    "Upgrade-Insecure-Requests": "1",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-}
-
-    cookie_pool = DTCookiePool(
-        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
-        header=headers, redis_key="dongtaices")
-    cookie = cookie_pool.get_cookie()
-    print(cookie)
-    # cookie_pool.del_cookie(cookie)
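
The generator above compiles the site's anti-bot JavaScript with execjs and replays the returned document.cookie string. The core call pattern, reduced to a runnable sketch (the JS body is a placeholder, and a real script may need cwd= pointing at a node_modules directory, as the deleted code did):

    import execjs

    js_func = "function sd() { return 'acw_tc=abc123; path=/'; }"  # stand-in for the extracted script
    ctx = execjs.compile(js_func)
    cookie_str = ctx.call("sd")

    cookies = {}
    for part in cookie_str.split(";"):
        if "=" in part:
            name, _, value = part.strip().partition("=")
            cookies[name] = value
    print(cookies)  # {'acw_tc': 'abc123', 'path': '/'}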

文件差异内容过多而无法显示
+ 0 - 1
FworkSpider/details/file/sj.js


+ 29 - 22
FworkSpider/feapder/buffer/item_buffer.py

@@ -99,9 +99,9 @@ class ItemBuffer(threading.Thread):
 
 
         return self._mysql_pipeline
         return self._mysql_pipeline
 
 
-    def run(self):
+    def run(self): # step 1 开始
         self._thread_stop = False
         self._thread_stop = False
-        while not self._thread_stop:
+        while not self._thread_stop: # 爬虫不停止,就一直循环刷新
             self.flush()
             self.flush()
             tools.delay_time(1)
             tools.delay_time(1)
 
 
@@ -111,13 +111,18 @@ class ItemBuffer(threading.Thread):
         self._thread_stop = True
         self._thread_stop = True
         self._started.clear()
         self._started.clear()
 
 
-    def put_item(self, item):
+    def put_item(self, item): # step 存储数据的入口 将需要存储的数据放入数据管道队列
         if isinstance(item, Item):
         if isinstance(item, Item):
             # 入库前的回调
             # 入库前的回调
-            item.pre_to_db()
-
-        self._items_queue.put(item)
 
 
+            if item.item_name == "ListItem":  # 测试框架有用,对listitem不进行存储,正式框架没有这个判断
+                return
+            item.pre_to_db()
+            # print(item)
+            if item.save: # 根据save字段,判断该条信息是否存储
+                self._items_queue.put(item)
+        else:
+            self._items_queue.put(item)
     def flush(self):
     def flush(self):
         try:
         try:
             items = []
             items = []
@@ -127,26 +132,26 @@ class ItemBuffer(threading.Thread):
             items_fingerprints = []
             items_fingerprints = []
             data_count = 0
             data_count = 0
 
 
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
+            while not self._items_queue.empty(): # step 2 数据管道队列不为空时 不等待直接取值
+                data = self._items_queue.get_nowait() # 队列的 不等待直接取值方法,类似get
                 data_count += 1
                 data_count += 1
 
 
                 # data 分类
                 # data 分类
                 if callable(data):
                 if callable(data):
                     callbacks.append(data)
                     callbacks.append(data)
 
 
-                elif isinstance(data, UpdateItem):
+                elif isinstance(data, UpdateItem): # 更新型数据,走更新管道,采集框架只存不更新,可以忽略不看
                     update_items.append(data)
                     update_items.append(data)
 
 
                 elif isinstance(data, Item):
                 elif isinstance(data, Item):
                     items.append(data)
                     items.append(data)
-                    if setting.ITEM_FILTER_ENABLE:
+                    if setting.ITEM_FILTER_ENABLE: # item去重,对于当前框架,无效,不看
                         items_fingerprints.append(data.fingerprint)
                         items_fingerprints.append(data.fingerprint)
 
 
                 else:  # request-redis
                 else:  # request-redis
                     requests.append(data)
                     requests.append(data)
 
 
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
+                if data_count >= UPLOAD_BATCH_MAX_SIZE: # step 3 需要存储的数据,达到一定数量后,统一存储
                     self.__add_item_to_db(
                     self.__add_item_to_db(
                         items, update_items, requests, callbacks, items_fingerprints
                         items, update_items, requests, callbacks, items_fingerprints
                     )
                     )
@@ -158,7 +163,7 @@ class ItemBuffer(threading.Thread):
                     items_fingerprints = []
                     items_fingerprints = []
                     data_count = 0
                     data_count = 0
 
 
-            if data_count:
+            if data_count: # step 3 管道为空后,将剩余的数据,统一存储
                 self.__add_item_to_db(
                 self.__add_item_to_db(
                     items, update_items, requests, callbacks, items_fingerprints
                     items, update_items, requests, callbacks, items_fingerprints
                 )
                 )
@@ -243,11 +248,11 @@ class ItemBuffer(threading.Thread):
         return datas_dict
         return datas_dict
 
 
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        # 打点 校验
+        # step 3.1.1 打点 记录总条数及每个key情况
         self.check_datas(table=table, datas=datas)
         self.check_datas(table=table, datas=datas)
 
 
-        for pipeline in self._pipelines:
-            if is_update:
+        for pipeline in self._pipelines: # setting 配置的 pipelines 方法
+            if is_update: # 更新方法 不看
                 if table == self._task_table and not isinstance(
                 if table == self._task_table and not isinstance(
                     pipeline, MysqlPipeline
                     pipeline, MysqlPipeline
                 ):
                 ):
@@ -260,7 +265,7 @@ class ItemBuffer(threading.Thread):
                     return False
                     return False
 
 
             else:
             else:
-                if not pipeline.save_items(table, datas):
+                if not pipeline.save_items(table, datas): # step 3.1.2 调用 pipeline 的 save_items 方法
                     log.error(
                     log.error(
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                     )
                     )
@@ -284,11 +289,11 @@ class ItemBuffer(threading.Thread):
         export_success = True
         export_success = True
         self._is_adding_to_db = True
         self._is_adding_to_db = True
 
 
-        # 去重
+        # 去重 item去重,不看
         if setting.ITEM_FILTER_ENABLE:
         if setting.ITEM_FILTER_ENABLE:
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
 
 
-        # 分捡
+        # step 分拣 按表名将数据分组,拆分后原 items 为空
         items_dict = self.__pick_items(items)
         items_dict = self.__pick_items(items)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
 
 
@@ -306,7 +311,7 @@ class ItemBuffer(threading.Thread):
                 % (table, tools.dumps_json(datas, indent=16))
                 % (table, tools.dumps_json(datas, indent=16))
             )
             )
 
 
-            if not self.__export_to_db(table, datas):
+            if not self.__export_to_db(table, datas): # step 3.1 导出到数据库
                 export_success = False
                 export_success = False
                 failed_items["add"].append({"table": table, "datas": datas})
                 failed_items["add"].append({"table": table, "datas": datas})
 
 
@@ -331,7 +336,7 @@ class ItemBuffer(threading.Thread):
                 failed_items["update"].append({"table": table, "datas": datas})
                 failed_items["update"].append({"table": table, "datas": datas})
 
 
         if export_success:
         if export_success:
-            # 执行回调
+            # step 3.2 保存成功后,执行回调
             while callbacks:
             while callbacks:
                 try:
                 try:
                     callback = callbacks.pop(0)
                     callback = callbacks.pop(0)
@@ -339,15 +344,17 @@ class ItemBuffer(threading.Thread):
                 except Exception as e:
                 except Exception as e:
                     log.exception(e)
                     log.exception(e)
 
 
-            # 删除做过的request
+            # step 删除做过的request
             if requests:
             if requests:
                 self.redis_db.zrem(self._table_request, requests)
                 self.redis_db.zrem(self._table_request, requests)
 
 
-            # 去重入库
+            # 去重入库 不走这个去重
             if setting.ITEM_FILTER_ENABLE:
             if setting.ITEM_FILTER_ENABLE:
                 if items_fingerprints:
                 if items_fingerprints:
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
         else:
         else:
+            # step 3.3 保存失败后的处理,记录失败数据
+
             failed_items["requests"] = requests
             failed_items["requests"] = requests
 
 
             if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
             if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
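
The put_item/flush changes above make ItemBuffer a buffered writer: items accumulate in a queue and are written in batches of UPLOAD_BATCH_MAX_SIZE, with one final partial batch when the queue drains. A stripped-down sketch of that loop using only the standard library (the batch size and save_batch callable are placeholders):

    from queue import Queue, Empty

    UPLOAD_BATCH_MAX_SIZE = 1000

    def flush(items_queue: Queue, save_batch):
        """Drain the queue and hand the data to save_batch in fixed-size chunks."""
        batch = []
        while not items_queue.empty():
            try:
                batch.append(items_queue.get_nowait())
            except Empty:
                break
            if len(batch) >= UPLOAD_BATCH_MAX_SIZE:   # full batch: write and start over
                save_batch(batch)
                batch = []
        if batch:                                     # leftover partial batch
            save_batch(batch)

In the real buffer the drained data is further split by type (Item, UpdateItem, request, callback) and by target table before being handed to the configured pipelines.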

+ 3 - 3
FworkSpider/feapder/buffer/request_buffer.py

@@ -44,9 +44,9 @@ class RequestBuffer(threading.Thread):
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                 )  # 默认过期时间为一个月
                 )  # 默认过期时间为一个月
 
 
-    def run(self):
+    def run(self): # step 1 线程入口
         self._thread_stop = False
         self._thread_stop = False
-        while not self._thread_stop:
+        while not self._thread_stop: # 循环将缓冲的任务批量存储入库
             try:
             try:
                 self.__add_request_to_db()
                 self.__add_request_to_db()
             except Exception as e:
             except Exception as e:
@@ -94,7 +94,7 @@ class RequestBuffer(threading.Thread):
         callbacks = []
         callbacks = []
 
 
         while self._requests_deque:
         while self._requests_deque:
-            request = self._requests_deque.popleft()
+            request = self._requests_deque.popleft() # 从任务队列中从左取任务(先进先出)
             self._is_adding_to_db = True
             self._is_adding_to_db = True
 
 
             if callable(request):
             if callable(request):
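
RequestBuffer drains its internal deque from the left, so requests reach Redis in the order they were produced. The FIFO behaviour in isolation:

    from collections import deque

    pending = deque(["req-1", "req-2", "req-3"])
    flushed = []
    while pending:
        flushed.append(pending.popleft())   # popleft -> first in, first out

    print(flushed)  # ['req-1', 'req-2', 'req-3']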

+ 1 - 1
FworkSpider/feapder/commands/create_builder.py

@@ -20,7 +20,7 @@ def main():
         "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
         "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
     )
     )
     spider.add_argument(
     spider.add_argument(
-        "-s",
+        "--s",
         "--spider",
         "--spider",
         nargs="+",
         nargs="+",
         help="创建爬虫\n"
         help="创建爬虫\n"

+ 33 - 0
FworkSpider/feapder/core/base_parser.py

@@ -9,6 +9,8 @@ Created on 2018-07-25 11:41:57
 """
 """
 import os
 import os
 import traceback
 import traceback
+
+import feapder
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.db.mysqldb import MysqlDB
 from feapder.db.mysqldb import MysqlDB
 from feapder.network.item import UpdateItem
 from feapder.network.item import UpdateItem
@@ -89,12 +91,43 @@ class BaseParser(object):
         """
         """
 
 
         pass
         pass
+    def infinite_crawl(self, request, response):
+        menu = request.item
+        list_item = request.list_item
+        if self.platform_next_page:  # real_page 统计连续无新增数据的翻页次数,采到数据时清零
+            if getattr(request, 'real_page', None) is None:
+                request.real_page = 0
+
+            request.real_page += 1
+            if list_item.rel_count > 0:
+                request.real_page = 0
+
+            if request.real_page <= 5 and request.page < self.platform_max_page:
+                request.page += 1
+                request.callback = self.parse
+                if getattr(request, 'new_callback', None) is not None:
+                    request.callback = eval(request.new_callback)
+                yield request
+        else:
+            if request.page < menu.get("crawl_page"):
+                request.page += 1
+                request.callback = self.parse
+                if getattr(request, 'new_callback', None) is not None:
+                    request.callback = eval(request.new_callback)
+                yield request
+
     def push_files(self, request, response):
     def push_files(self, request, response):
         """
         """
         @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
         @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
         ---------
         ---------
         @param request:  request.url 为文件下载地址, 该方法需要自行调用
         @param request:  request.url 为文件下载地址, 该方法需要自行调用
         request.INFO  为上传文件时所需要提供的部分参数  必传
         request.INFO  为上传文件时所需要提供的部分参数  必传
+         info = {
+            "org_url": "http://www...",  # 文件下载链接
+            "filename": f"{list_item.title}.docx",  # 文件名
+            "channel": list_item.channel,
+            "ftype": 'docx,zip,ftp', # 文件类型
+        }
         request.headers 则存放请求的必要参数,如:parmas,headers  必传
         request.headers 则存放请求的必要参数,如:parmas,headers  必传
         ---------
         ---------
         @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
         @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
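
The INFO block documented above is the payload push_files expects alongside the download request. A hedged sketch of how a detail parser might assemble such a request (auto_request=False per the docstring; the helper name and field values are illustrative):

    import feapder

    def build_attachment_request(list_item, file_url, headers):
        info = {
            "org_url": file_url,                    # 文件下载链接
            "filename": f"{list_item.title}.docx",  # 文件名
            "channel": list_item.channel,
            "ftype": "docx",                        # 文件类型
        }
        return feapder.Request(
            url=file_url,
            auto_request=False,      # 由 push_files 自行下载,避免响应失败时丢文件
            callback="push_files",   # 由 BaseParser.push_files 回调处理
            INFO=info,
            headers=headers,
        )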

+ 6 - 5
FworkSpider/feapder/core/collector.py

@@ -48,11 +48,11 @@ class Collector(threading.Thread):
 
 
         self.__delete_dead_node()
         self.__delete_dead_node()
 
 
-    def run(self):
+    def run(self):  # step 线程入口
         self._thread_stop = False
         self._thread_stop = False
         while not self._thread_stop:
         while not self._thread_stop:
             try:
             try:
-                self.__report_node_heartbeat()
+                self.__report_node_heartbeat() # step 汇报节点心跳
                 self.__input_data()
                 self.__input_data()
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -67,23 +67,24 @@ class Collector(threading.Thread):
 
 
     def __input_data(self):
     def __input_data(self):
         current_timestamp = tools.get_current_timestamp()
         current_timestamp = tools.get_current_timestamp()
-        if len(self._todo_requests) >= self._request_count:
+        if len(self._todo_requests) >= self._request_count: # step 待执行任务数量>设置的任务数量上限 不处理
             return
             return
 
 
         request_count = self._request_count  # 先赋值
         request_count = self._request_count  # 先赋值
-        # 查询最近有心跳的节点数量
+        # step 查询最近有心跳的节点数量
         spider_count = self._db.zget_count(
         spider_count = self._db.zget_count(
             self._tab_spider_status,
             self._tab_spider_status,
             priority_min=current_timestamp - (self._interval + 10),
             priority_min=current_timestamp - (self._interval + 10),
             priority_max=current_timestamp,
             priority_max=current_timestamp,
         )
         )
-        # 根据等待节点数量,动态分配request
+        # step 根据等待节点数量,动态分配request
         if spider_count:
         if spider_count:
             # 任务数量
             # 任务数量
             task_count = self._db.zget_count(self._tab_requests)
             task_count = self._db.zget_count(self._tab_requests)
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             request_count = task_count // spider_count + 1
             request_count = task_count // spider_count + 1
 
 
+        # step 判断 request_count 是否超过设置的上限,超过则重置为上限
         request_count = (
         request_count = (
             request_count
             request_count
             if request_count <= self._request_count
             if request_count <= self._request_count
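
The allocation rule annotated above gives each live node task_count // spider_count + 1 requests, then caps the figure at the per-node limit. A worked example:

    def allocate(task_count, spider_count, per_node_limit):
        """Dynamic share for one node, mirroring Collector.__input_data."""
        share = task_count // spider_count + 1 if spider_count else per_node_limit
        return min(share, per_node_limit)

    print(allocate(task_count=95, spider_count=4, per_node_limit=32))    # 24  (95 // 4 + 1)
    print(allocate(task_count=1000, spider_count=2, per_node_limit=32))  # 32  (capped at the limit)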

+ 40 - 25
FworkSpider/feapder/core/parser_control.py

@@ -46,11 +46,11 @@ class PaserControl(threading.Thread):
 
 
         self._wait_task_time = 0
         self._wait_task_time = 0
 
 
-    def run(self):
+    def run(self):  # step 1 开始
         self._thread_stop = False
         self._thread_stop = False
         while not self._thread_stop:
         while not self._thread_stop:
             try:
             try:
-                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
+                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT) # step 2 获取任务
                 if not requests:
                 if not requests:
                     if not self.is_show_tip:
                     if not self.is_show_tip:
                         log.debug("parser 等待任务...")
                         log.debug("parser 等待任务...")
@@ -63,7 +63,7 @@ class PaserControl(threading.Thread):
                     continue
                     continue
 
 
                 self.is_show_tip = False
                 self.is_show_tip = False
-                self.deal_requests(requests)
+                self.deal_requests(requests) # step 3 开始处理任务
 
 
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -90,17 +90,17 @@ class PaserControl(threading.Thread):
                 if parser.name == request.parser_name:
                 if parser.name == request.parser_name:
                     used_download_midware_enable = False
                     used_download_midware_enable = False
                     try:
                     try:
-                        # 记录需下载的文档
+                        # step 4 记录需下载的文档
                         self.record_download_status(
                         self.record_download_status(
                             PaserControl.DOWNLOAD_TOTAL, parser.name
                             PaserControl.DOWNLOAD_TOTAL, parser.name
                         )
                         )
 
 
-                        # 解析request
+                        # step 5 解析request
                         if request.auto_request:
                         if request.auto_request:
                             request_temp = None
                             request_temp = None
                             response = None
                             response = None
 
 
-                            # 下载中间件
+                            # step 6 运行下载中间件 分两种,一种爬虫自定义的中间件,一种通过request传过来的中间件方法
                             if request.download_midware:
                             if request.download_midware:
                                 if isinstance(request.download_midware, (list, tuple)):
                                 if isinstance(request.download_midware, (list, tuple)):
                                     request_temp = request
                                     request_temp = request
@@ -122,10 +122,10 @@ class PaserControl(threading.Thread):
                                         )
                                         )
                                     )
                                     )
                                     request_temp = download_midware(request)
                                     request_temp = download_midware(request)
-                            elif request.download_midware != False:
+                            elif request.download_midware != False: # NOTE 应用场景尚不明确
                                 request_temp = parser.download_midware(request)
                                 request_temp = parser.download_midware(request)
 
 
-                            # 请求
+                            # step 7 开始处理请求
                             if request_temp:
                             if request_temp:
                                 if (
                                 if (
                                     isinstance(request_temp, (tuple, list))
                                     isinstance(request_temp, (tuple, list))
@@ -150,10 +150,14 @@ class PaserControl(threading.Thread):
                                             )
                                             )
                                         )
                                         )
                                     except Exception as e:
                                     except Exception as e:
-                                        log.info("requests", extra={"url": request.url or request_temp.url, "code": -1,"error_info":e})
+                                        response = None
+                                        log.info("requests", extra={"url": request.url or request_temp.url,"code": -1,"error_info":e})
                                         raise Exception(
                                         raise Exception(
-                                            "连接超时 url: %s" % (request.url or request_temp.url)
+                                            "request 请求异常: %s url: %s" % (e,request.url or request_temp.url)
                                         )
                                         )
+                                    except:
+                                        response = None
+                                        log.error("request 请求异常 url: %s" % (request.url or request_temp.url))
 
 
                             else:
                             else:
                                 try:
                                 try:
@@ -165,23 +169,31 @@ class PaserControl(threading.Thread):
                                         )
                                         )
                                     )
                                     )
                                 except Exception as e:
                                 except Exception as e:
+                                    response = None
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     raise Exception(
                                     raise Exception(
-                                        "连接超时 url: %s" % (request.url or request_temp.url)
+                                        "request 请求异常:%s url: %s" % (e,request.url or request_temp.url)
                                     )
                                     )
+                                except:
+                                    response = None
+                                    log.error("request 请求异常 url: %s" % (request.url or request_temp.url))
+                                    # raise Exception(
+                                    #     "response 请求异常 url: %s" % (request.url or request_temp.url))
+
 
 
                             if response == None:
                             if response == None:
                                 raise Exception(
                                 raise Exception(
-                                    "连接超时 url: %s" % (request.url or request_temp.url)
+                                    "request 请求异常,无法定位错误信息 url: %s" % (request.url or request_temp.url)
                                 )
                                 )
 
 
                         else:
                         else:
                             response = None
                             response = None
 
 
-                        # 校验
+                        # step 8 校验response 可以脚本自定义
+                        #  TODO 针对登录网站、动态cookie,可以考虑使用上 validate 方法,在 validate 方法中判断cookie是否正常
                         if parser.validate(request, response) == False:
                         if parser.validate(request, response) == False:
                             continue
                             continue
-
+                        #  step 9 走回调方法 如果有parser的回调函数,则用回调处理,否则默认用parser处理
                         if request.callback:  # 如果有parser的回调函数,则用回调处理
                         if request.callback:  # 如果有parser的回调函数,则用回调处理
                             callback_parser = (
                             callback_parser = (
                                 request.callback
                                 request.callback
@@ -198,9 +210,9 @@ class PaserControl(threading.Thread):
                                 % (parser.name, request.callback or "parse")
                                 % (parser.name, request.callback or "parse")
                             )
                             )
 
 
-                        # 标识上一个result是什么
+                        # step 标识上一个result是什么
                         result_type = 0  # 0\1\2 (初始值\request\item)
                         result_type = 0  # 0\1\2 (初始值\request\item)
-                        # 此处判断是request 还是 item
+                        #  step 10 判断 result 是request 还是 item
                         for result in results or []:
                         for result in results or []:
                             if isinstance(result, Request):
                             if isinstance(result, Request):
                                 result_type = 1
                                 result_type = 1
@@ -208,28 +220,28 @@ class PaserControl(threading.Thread):
                                 result.parser_name = result.parser_name or parser.name
                                 result.parser_name = result.parser_name or parser.name
 
 
                                 # 判断是同步的callback还是异步的
                                 # 判断是同步的callback还是异步的
-                                if result.request_sync:  # 同步
+                                if result.request_sync:  # 同步请求,加入当前批次直接处理
                                     request_dict = {
                                     request_dict = {
                                         "request_obj": result,
                                         "request_obj": result,
                                         "request_redis": None,
                                         "request_redis": None,
                                     }
                                     }
                                     requests.append(request_dict)
                                     requests.append(request_dict)
                                 else:  # 异步
                                 else:  # 异步
-                                    # 将next_request 入库
+                                    # step 10.1 将next_request 入库 进行下一个循环
                                     self._request_buffer.put_request(result)
                                     self._request_buffer.put_request(result)
                                     del_request_redis_after_request_to_db = True
                                     del_request_redis_after_request_to_db = True
 
 
                             elif isinstance(result, Item):
                             elif isinstance(result, Item):
                                 result_type = 2
                                 result_type = 2
-                                # 将item入库
+                                # step 10.1 将item入库
                                 self._item_buffer.put_item(result)
                                 self._item_buffer.put_item(result)
-                                # 需删除正在做的request
+                                # step 10.2 需删除正在做的request
                                 del_request_redis_after_item_to_db = True
                                 del_request_redis_after_item_to_db = True
 
 
-                            elif callable(result):  # result为可执行的无参函数
+                            elif callable(result):  # NOTE result 为可执行的无参函数
                                 if (
                                 if (
                                     result_type == 2
                                     result_type == 2
-                                ):  # item 的 callback,buffer里的item均入库后再执行
+                                ):  # step 10.1 item 的 callback,buffer里的item均入库后再执行
                                     self._item_buffer.put_item(result)
                                     self._item_buffer.put_item(result)
                                     del_request_redis_after_item_to_db = True
                                     del_request_redis_after_item_to_db = True
 
 
@@ -393,7 +405,7 @@ class PaserControl(threading.Thread):
                                 del_request_redis_after_request_to_db = True
                                 del_request_redis_after_request_to_db = True
 
 
                     else:
                     else:
-                        # 记录下载成功的文档
+                        # step 11 记录下载成功的文档
                         self.record_download_status(
                         self.record_download_status(
                             PaserControl.DOWNLOAD_SUCCESS, parser.name
                             PaserControl.DOWNLOAD_SUCCESS, parser.name
                         )
                         )
@@ -425,7 +437,7 @@ class PaserControl(threading.Thread):
                 else:
                 else:
                     self._request_buffer.put_del_request(request_redis)
                     self._request_buffer.put_del_request(request_redis)
 
 
-        if setting.SPIDER_SLEEP_TIME:
+        if setting.SPIDER_SLEEP_TIME: # 如果设置了取任务的休眠时间,这里会稍微等待一下
             if (
             if (
                 isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
                 isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
                 and len(setting.SPIDER_SLEEP_TIME) == 2
                 and len(setting.SPIDER_SLEEP_TIME) == 2
@@ -559,8 +571,11 @@ class AirSpiderParserControl(PaserControl):
                                 except Exception as e:
                                 except Exception as e:
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     log.info("requests", extra={"url": request.url or request_temp.url, "code": -1, "error_info": e})
                                     raise Exception(
                                     raise Exception(
-                                        "连接超时 url: %s" % (request.url or request_temp.url)
+                                        "565 连接超时 url: %s" % (request.url or request_temp.url)
                                     )
                                     )
+                                except:
+                                    raise Exception(
+                                        "response 请求超时 url: %s" % (request.url or request_temp.url))
 
 
                         else:
                         else:
                             response = None
                             response = None
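
Step 6 above runs either the spider's own download_midware or one attached to the request; in both cases the middleware receives the request, mutates it, and returns it. A minimal sketch (the proxy address is a placeholder):

    def download_midware(request):
        """Runs before the request is sent; return the (possibly modified) request."""
        request.headers = {"User-Agent": "Mozilla/5.0"}
        request.proxies = {"http": "http://127.0.0.1:8888",    # placeholder proxy
                           "https": "http://127.0.0.1:8888"}
        return request

    # per-request:  feapder.Request(url, download_midware=download_midware)
    # or define a method with the same name on the spider to apply it to every request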

+ 62 - 45
FworkSpider/feapder/core/scheduler.py

@@ -7,10 +7,13 @@ Created on 2017-01-09 10:38
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
+import json
+import sys
 import threading
 import threading
 import time
 import time
 from collections import Iterable
 from collections import Iterable
 
 
+
 import feapder.setting as setting
 import feapder.setting as setting
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.buffer.item_buffer import ItemBuffer
 from feapder.buffer.item_buffer import ItemBuffer
@@ -30,7 +33,9 @@ SPIDER_START_TIME_KEY = "spider_start_time"
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 
 
-
+class Obj(object):
+    def __init__(self, dict_):
+        self.__dict__.update(dict_)
 class Scheduler(threading.Thread):
 class Scheduler(threading.Thread):
     __custom_setting__ = {}
     __custom_setting__ = {}
 
 
@@ -96,6 +101,7 @@ class Scheduler(threading.Thread):
         if "auto_stop_when_spider_done" in kwargs:
         if "auto_stop_when_spider_done" in kwargs:
             self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
             self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
         else:
         else:
+
             self._keep_alive = (
             self._keep_alive = (
                 keep_alive if keep_alive is not None else setting.KEEP_ALIVE
                 keep_alive if keep_alive is not None else setting.KEEP_ALIVE
             )
             )
@@ -164,18 +170,18 @@ class Scheduler(threading.Thread):
         else:
         else:
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
 
 
-    def run(self):
-        if not self.is_reach_next_spider_time():
+    def run(self):  # STEP 1 爬虫框架入口
+        if not self.is_reach_next_spider_time(): # STEP 2 检测爬虫是否到达执行时间
             return
             return
 
 
-        self._start()
+        self._start() # STEP 3 开始运行爬虫
 
 
-        while True:
+        while True: # step 4 对爬虫状态的一个监控
             try:
             try:
-                if self.all_thread_is_done():
+                if self.all_thread_is_done(): # Step 5 判断爬虫是否运行完成
                     if not self._is_notify_end:
                     if not self._is_notify_end:
                         self.spider_end()  # 跑完一轮
                         self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
+                        self.record_spider_state(  # step 6 记录爬虫结束状态
                             spider_type=1,
                             spider_type=1,
                             state=1,
                             state=1,
                             spider_end_time=tools.get_current_date(),
                             spider_end_time=tools.get_current_date(),
@@ -184,14 +190,14 @@ class Scheduler(threading.Thread):
 
 
                         self._is_notify_end = True
                         self._is_notify_end = True
 
 
-                    if not self._keep_alive:
+                    if not self._keep_alive: # step 7 如果不是常驻爬虫 停止所有线程
                         self._stop_all_thread()
                         self._stop_all_thread()
                         break
                         break
 
 
                 else:
                 else:
                     self._is_notify_end = False
                     self._is_notify_end = False
 
 
-                self.check_task_status()
+                self.check_task_status() # step 8 检查任务状态,并进行告警通知
 
 
             except Exception as e:
             except Exception as e:
                 log.exception(e)
                 log.exception(e)
@@ -221,13 +227,13 @@ class Scheduler(threading.Thread):
                     raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
                     raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
 
 
                 result_type = 1
                 result_type = 1
-                for result in results or []:
-                    if isinstance(result, Request):
+                for result in results or []: # step 对yield 的数据进行判断处理
+                    if isinstance(result, Request): # Request 加入到任务队列
                         result.parser_name = result.parser_name or parser.name
                         result.parser_name = result.parser_name or parser.name
                         self._request_buffer.put_request(result)
                         self._request_buffer.put_request(result)
                         result_type = 1
                         result_type = 1
 
 
-                    elif isinstance(result, Item):
+                    elif isinstance(result, Item): # Item 数据,存入到数据管道队列,等待存储
                         self._item_buffer.put_item(result)
                         self._item_buffer.put_item(result)
                         result_type = 2
                         result_type = 2
 
 
@@ -247,15 +253,16 @@ class Scheduler(threading.Thread):
                 self._item_buffer.flush()
                 self._item_buffer.flush()
 
 
     def _start(self):
     def _start(self):
-        # 启动request_buffer
-        self._request_buffer.start()
-        # 启动item_buffer
-        self._item_buffer.start()
-        # 启动collector
-        self._collector.start()
+
+        self._request_buffer.start()  # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
+
+        self._item_buffer.start()  # STEP 3.2 启动item_buffer -- 管道管理器 负责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
+
+        self._collector.start()  # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
 
 
         # 启动parser control
         # 启动parser control
         for i in range(self._thread_count):
         for i in range(self._thread_count):
+            # STEP 3.4 根据 任务管理器、redis_key,下载器,数据管道创建一个线程池
             parser_control = self._parser_control_obj(
             parser_control = self._parser_control_obj(
                 self._collector,
                 self._collector,
                 self._redis_key,
                 self._redis_key,
@@ -263,22 +270,22 @@ class Scheduler(threading.Thread):
                 self._item_buffer,
                 self._item_buffer,
             )
             )
 
 
-            for parser in self._parsers:
+            for parser in self._parsers:  # step 3.5 把所有任务放入线程池
                 parser_control.add_parser(parser)
                 parser_control.add_parser(parser)
 
 
-            parser_control.start()
+            parser_control.start()  # STEP 3.6 根据线程池开辟一个线程
             self._parser_controls.append(parser_control)
             self._parser_controls.append(parser_control)
 
 
-        # 下发任务 因为时间可能比较长,放到最后面
+        # STEP 3.7下发任务 有消费线程之后开始读取任务
         if setting.RETRY_FAILED_REQUESTS:
         if setting.RETRY_FAILED_REQUESTS:
             # 重设失败的任务, 不用加锁,原子性操作
             # 重设失败的任务, 不用加锁,原子性操作
             handle_failed_requests = HandleFailedRequests(self._redis_key)
             handle_failed_requests = HandleFailedRequests(self._redis_key)
             handle_failed_requests.reput_failed_requests_to_requests()
             handle_failed_requests.reput_failed_requests_to_requests()
 
 
-        # 下发新任务
+        # STEP 3.8下发新任务 ,生产新任务
         if self._auto_start_requests:  # 自动下发
         if self._auto_start_requests:  # 自动下发
             if self.wait_lock:
             if self.wait_lock:
-                # 将添加任务处加锁,防止多进程之间添加重复的任务
+                # Stress 将添加任务处加锁,防止多进程之间添加重复的任务
                 with RedisLock(key=self._spider_name) as lock:
                 with RedisLock(key=self._spider_name) as lock:
                     if lock.locked:
                     if lock.locked:
                         self.__add_task()
                         self.__add_task()
@@ -286,34 +293,34 @@ class Scheduler(threading.Thread):
                 self.__add_task()
                 self.__add_task()
 
 
     def all_thread_is_done(self):
     def all_thread_is_done(self):
-        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
-            # 检测 collector 状态
+        for i in range(3):  # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+            # STEP 5.1 检测 collector 状态
             if (
             if (
                 self._collector.is_collector_task()
                 self._collector.is_collector_task()
                 or self._collector.get_requests_count() > 0
                 or self._collector.get_requests_count() > 0
             ):
             ):
                 return False
                 return False
 
 
-            # 检测 parser_control 状态
+            # STEP 5.2 检测 parser_control 状态
             for parser_control in self._parser_controls:
             for parser_control in self._parser_controls:
                 if not parser_control.is_not_task():
                 if not parser_control.is_not_task():
                     return False
                     return False
 
 
-            # 检测 item_buffer 状态
+            # STEP 5.3 检测 item_buffer 状态
             if (
             if (
                 self._item_buffer.get_items_count() > 0
                 self._item_buffer.get_items_count() > 0
                 or self._item_buffer.is_adding_to_db()
                 or self._item_buffer.is_adding_to_db()
             ):
             ):
                 return False
                 return False
 
 
-            # 检测 request_buffer 状态
+            # STEP 5.4 检测 request_buffer 状态
             if (
             if (
                 self._request_buffer.get_requests_count() > 0
                 self._request_buffer.get_requests_count() > 0
                 or self._request_buffer.is_adding_to_db()
                 or self._request_buffer.is_adding_to_db()
             ):
             ):
                 return False
                 return False
 
 
-            tools.delay_time(1)
+            tools.delay_time(1) # 休眠 1 秒
 
 
         return True
         return True
 
 
@@ -322,16 +329,15 @@ class Scheduler(threading.Thread):
         """
         """
         检查任务状态 预警
         检查任务状态 预警
         """
         """
-        # 每分钟检查一次
+        # step 每分钟检查一次
         now_time = time.time()
         now_time = time.time()
-        if now_time - self._last_check_task_status_time > 30:
+        if now_time - self._last_check_task_status_time > 60:
             self._last_check_task_status_time = now_time
             self._last_check_task_status_time = now_time
         else:
         else:
             return
             return
 
 
-        # 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
+        # step 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
         task_count = self._redisdb.zget_count(self._tab_requests)
         task_count = self._redisdb.zget_count(self._tab_requests)
-        print(task_count)
 
 
         if task_count:
         if task_count:
             if task_count != self._last_task_count:
             if task_count != self._last_task_count:
@@ -342,7 +348,7 @@ class Scheduler(threading.Thread):
                     tools.get_current_timestamp(),
                     tools.get_current_timestamp(),
                 )  # 多进程会重复发消息, 使用reids记录上次统计时间
                 )  # 多进程会重复发消息, 使用reids记录上次统计时间
             else:
             else:
-                # 判断时间间隔是否超过20分钟
+                # step 判断时间间隔是否超过20分钟
                 lua = """
                 lua = """
                     -- local key = KEYS[1]
                     -- local key = KEYS[1]
                     local field = ARGV[1]
                     local field = ARGV[1]
@@ -350,7 +356,7 @@ class Scheduler(threading.Thread):
 
 
                     -- 取值
                     -- 取值
                     local last_timestamp = redis.call('hget', KEYS[1], field)
                     local last_timestamp = redis.call('hget', KEYS[1], field)
-                    if last_timestamp and current_timestamp - last_timestamp >= 600 then
+                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                         return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                         return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                     end
                     end
 
 
@@ -372,13 +378,11 @@ class Scheduler(threading.Thread):
                 )
                 )
 
 
                 if overtime:
                 if overtime:
-                    # 发送报警
+                    # step 记录日志,并发送报警
                     msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                     msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                         self._spider_name, tools.format_seconds(overtime)
                         self._spider_name, tools.format_seconds(overtime)
                     )
                     )
-                    log.error(msg)
-                    log.error("爬虫任务异常停滞,爬虫将强制退出")
-                    exit()
+                    log.error(msg)  # TODO 这一步可以加一个print,在平台的日志框里输出
                     self.send_msg(
                     self.send_msg(
                         msg,
                         msg,
                         level="error",
                         level="error",
@@ -459,9 +463,20 @@ class Scheduler(threading.Thread):
         self._started.clear()
         self._started.clear()
 
 
     def send_msg(self, msg, level="debug", message_prefix=""):
     def send_msg(self, msg, level="debug", message_prefix=""):
+        #TODO 这个方法是消息预警,但如果每次都发送,会造成消息轰炸,所以采集框架的消息预警没有开启,
+        # 后续优化方向:消息预警的内容可以通过接口接收并保存,并按紧急程度区分,紧急度高的消息可直接发送至微信群;这里尽量不要直接存储,feapder
+        # 框架不进行mongo的直接存储,只做查询操作
         # log.debug("发送报警 level:{} msg{}".format(level, msg))
         # log.debug("发送报警 level:{} msg{}".format(level, msg))
         tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
         tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
 
 
+    def get_argvs(self):
+        argvs = {"next_page": False, "max_page": 10}
+        for item in sys.argv[1:]:
+            print(item)
+            if item.startswith("--"):
+                argvs[item.replace("--", "").split('=')[0]] = eval(item.split('=')[-1]) # 此处使用eval的原因是字符串转bool或int
+        return json.loads(json.dumps(argvs), object_hook=Obj)
+
     def spider_begin(self):
     def spider_begin(self):
         """
         """
         @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
         @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
@@ -474,6 +489,8 @@ class Scheduler(threading.Thread):
             self._begin_callback()
             self._begin_callback()
 
 
         for parser in self._parsers:
         for parser in self._parsers:
+            parser.platform_next_page = self.get_argvs().next_page
+            parser.platform_max_page = self.get_argvs().max_page
             parser.start_callback()
             parser.start_callback()
 
 
         # 记录开始时间
         # 记录开始时间
@@ -486,16 +503,16 @@ class Scheduler(threading.Thread):
             # 发送消息
             # 发送消息
             # self.send_msg("《%s》爬虫开始" % self._spider_name)
             # self.send_msg("《%s》爬虫开始" % self._spider_name)
 
 
-    def spider_end(self):
+    def spider_end(self): # step end 爬虫结束时的一些操作
         self.record_end_time()
         self.record_end_time()
 
 
-        if self._end_callback:
+        if self._end_callback:  # 若设置了自定义的结束回调,则在此执行
             self._end_callback()
             self._end_callback()
 
 
         for parser in self._parsers:
         for parser in self._parsers:
             if not self._keep_alive:
             if not self._keep_alive:
-                parser.close()
-            parser.end_callback()
+                parser.close() # 爬虫可自定义close
+            parser.end_callback() # 调用结束回调函数,可在爬虫自定义
 
 
         if not self._keep_alive:
         if not self._keep_alive:
             # 关闭webdirver
             # 关闭webdirver
@@ -537,10 +554,10 @@ class Scheduler(threading.Thread):
                 self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
                 self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
             )
             )
 
 
-    def is_reach_next_spider_time(self):
+    def is_reach_next_spider_time(self): # 如果没有设置爬虫的启动时间,这一块儿不需要管的
         if not self._batch_interval:
         if not self._batch_interval:
             return True
             return True
-
+        # compare the last finish time with the current time; if the interval has not elapsed, block and wait before starting the spider
         last_spider_end_time = self._redisdb.hget(
         last_spider_end_time = self._redisdb.hget(
             self._tab_spider_time, SPIDER_END_TIME_KEY
             self._tab_spider_time, SPIDER_END_TIME_KEY
         )
         )

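Note on the `get_argvs` helper added above: the platform passes paging flags on the command line (for example `--next_page=True --max_page=5`), and the JSON round-trip with the `Obj` object hook turns the dict into attribute access for `spider_begin`. A minimal standalone sketch of that behaviour, with a hypothetical `Obj` stand-in (the real hook is defined elsewhere in the fork):

import json
import sys

class Obj:
    # hypothetical stand-in for the Obj hook used above: exposes dict keys as attributes
    def __init__(self, d):
        self.__dict__.update(d)

def get_argvs(argv=None):
    argvs = {"next_page": False, "max_page": 10}
    for item in (argv if argv is not None else sys.argv[1:]):
        if item.startswith("--"):
            key, _, value = item[2:].partition("=")
            argvs[key] = eval(value)  # trusted platform input only: "True" -> bool, "5" -> int
    return json.loads(json.dumps(argvs), object_hook=Obj)

args = get_argvs(["--next_page=True", "--max_page=5"])
print(args.next_page, args.max_page)  # True 5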
+ 76 - 116
FworkSpider/feapder/dedup/__init__.py

@@ -2,98 +2,48 @@
 """
 """
 Created on 2018-12-13 21:08
 Created on 2018-12-13 21:08
 ---------
 ---------
-@summary:
+@summary:  SHA-256 + Redis-cluster deduplication, the dedup used in production
 ---------
 ---------
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
 
 
 import copy
 import copy
-from typing import Any, List, Union, Optional, Tuple, Callable
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-
+from typing import Any, List, Union, Tuple, Callable
+import rediscluster
+from Crypto.Hash import SHA256
+from feapder import setting
 
 
 class Dedup:
 class Dedup:
     BloomFilter = 1
     BloomFilter = 1
     MemoryFilter = 2
     MemoryFilter = 2
     ExpireFilter = 3
     ExpireFilter = 3
+    def __init__(self, filter_type: int = BloomFilter):
+        self._to_sha256 = True
+        self._to_redis = None
 
 
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        """
-        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
-        Args:
-            filter_type: 过滤器类型 BloomFilter
-            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
-            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
-            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
-            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
-            to_md5: 去重前是否将数据转为MD5,默认是
-            redis_url: redis://[[username]:[password]]@localhost:6379/0
-                       BloomFilter 与 ExpireFilter 使用
-                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
-            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
-            error_rate:布隆过滤器的误判率 默认0.00001
-            **kwargs:
-        """
-
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
-                "name", "bloomfilter"
-            )
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
-                )
-
-        self._to_md5 = to_md5
+    @property
+    def redis_cluster(self):  # lazily connect to the Redis cluster
+        if not self._to_redis:
+            startup_nodes = [{"host": i.get("host"), "port": i.get("port")} for i in setting.REDISCLUSTER]
+            self._to_redis =  rediscluster.RedisCluster(startup_nodes=startup_nodes, decode_responses=True)
+        return self._to_redis
 
 
     def __repr__(self):
     def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
+        return 'sha256'
+
+    def sha256(self, info):
+        if info is None:
+            return ''
+        res = SHA256.new(info.encode('utf-8'))
+        data = res.hexdigest()
+        return data
+
+    def _deal_datas(self, datas):  # hash the incoming data before the dedup check
+        if self._to_sha256:
             if isinstance(datas, list):
             if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
+                keys = [self.sha256(data) for data in datas]
             else:
             else:
-                keys = get_md5(datas)
+                keys = self.sha256(datas)
         else:
         else:
             keys = copy.deepcopy(datas)
             keys = copy.deepcopy(datas)
 
 
@@ -108,11 +58,35 @@ class Dedup:
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         """
         """
-
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
+        is_added = self.insert_key(keys, skip_check)
 
 
         return is_added
         return is_added
+    def insert_key(self, keys, skip_check):
+        # skip_check is kept for interface compatibility; keys live for two years
+        if isinstance(keys, list):
+            is_added = []
+            for key in keys:
+                if not self.redis_cluster.exists("pylist_" + key):
+                    self.redis_cluster.set("pylist_" + key, 1, ex=86400 * 365 * 2)
+                    is_added.append(1)
+                else:
+                    is_added.append(0)
+            return is_added
+        if not self.redis_cluster.exists("pylist_" + keys):
+            self.redis_cluster.set("pylist_" + keys, 1, ex=86400 * 365 * 2)
+            return 1
+        return 0
+
+    def exists(self,keys):
+        exists = []
+        if isinstance(keys, list):
+            for key in keys:
+                exists.append(self.exit_key(key))
+        else:
+            exists.append(self.exit_key(keys))
+        return exists
+    def exit_key(self,key):
+        if self.redis_cluster.exists(key):
+            return True
+        if self.redis_cluster.exists("pylist_"+key):
+            return True
+        return False
+
+
 
 
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
         """
         """
@@ -121,58 +95,44 @@ class Dedup:
         @return: list / 单个值 (存在返回1 不存在返回0)
         @return: list / 单个值 (存在返回1 不存在返回0)
         """
         """
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
+        is_exists = self.exists(keys)
 
 
         return is_exists
         return is_exists
 
 
+
     def filter_exist_data(
     def filter_exist_data(
         self,
         self,
         datas: List[Any],
         datas: List[Any],
         *,
         *,
-        datas_fingerprints: Optional[List] = None,
         callback: Callable[[Any], None] = None
         callback: Callable[[Any], None] = None
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
         """
         """
         过滤掉已存在的数据
         过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
         @param datas: 数据 列表
         @param datas: 数据 列表
         @param callback: 数据已存在时的回调 callback(data)
         @param callback: 数据已存在时的回调 callback(data)
         @return: None
         @return: None
+        e.g. is_exists = [0, 1, 1], datas = ["b", "c", "d"]  ->  datas is left as ["b"]
         """
         """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
+        is_exists = self.get(datas)
         dedup_datas = []
         dedup_datas = []
+        while is_exists:
+            data = datas.pop(0)
+            is_exist = is_exists.pop(0)
 
 
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas
+            if not is_exist:
+                dedup_datas.append(data)
+            else:
+                if callback:
+                    callback(data)
+
+        datas.extend(dedup_datas)
+        return datas
+
+if __name__ == '__main__':
+    dedup = Dedup(Dedup.BloomFilter)
+    href = 'http://www.ccgp-tianjin.gov.cn/viewer.do?id=339715380&ver=2222'
+    ss = dedup.filter_exist_data([href])
+    # res = dedup.add([href,'llk'])
+    print(ss)
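For reference, the rewritten `Dedup` above assumes `setting.REDISCLUSTER` is a list of node dicts and stores every fingerprint under `pylist_<sha256>` with a two-year TTL. A hedged usage sketch (the node addresses are placeholders, not the real cluster):

# setting.py -- assumed shape of the cluster config read by Dedup.redis_cluster
REDISCLUSTER = [
    {"host": "192.168.3.207", "port": 7000},   # placeholder nodes
    {"host": "192.168.3.208", "port": 7001},
]

# spider side
from feapder.dedup import Dedup

dedup = Dedup()
href = "http://www.example.com/notice/1.html"
print(dedup.get(href))    # [False] -- not seen yet
dedup.add(href)           # writes key "pylist_<sha256(href)>" with ex=86400*365*2
print(dedup.get(href))    # [True]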

+ 178 - 0
FworkSpider/feapder/dedup/old__init__.py

@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-12-13 21:08
+---------
+@summary: Bloom-filter deduplication, the dedup used by the test framework
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import copy
+from typing import Any, List, Union, Optional, Tuple, Callable
+
+from feapder.utils.tools import get_md5
+from .bloomfilter import BloomFilter, ScalableBloomFilter
+from .expirefilter import ExpireFilter
+
+
+class Dedup:
+    BloomFilter = 1
+    MemoryFilter = 2
+    ExpireFilter = 3
+
+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
+        """
+        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
+        Args:
+            filter_type: 过滤器类型 BloomFilter
+            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
+            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
+            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
+            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
+            to_md5: 去重前是否将数据转为MD5,默认是
+            redis_url: redis://[[username]:[password]]@localhost:6379/0
+                       BloomFilter 与 ExpireFilter 使用
+                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
+            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
+            error_rate:布隆过滤器的误判率 默认0.00001
+            **kwargs:
+        """
+
+        if filter_type == Dedup.ExpireFilter:
+            try:
+                expire_time = kwargs["expire_time"]
+            except:
+                raise ValueError("需传参数 expire_time")
+
+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
+                "name", expire_time
+            )
+            expire_time_record_key = "dedup:expire_set:expire_time"
+
+            self.dedup = ExpireFilter(
+                name=name,
+                expire_time=expire_time,
+                expire_time_record_key=expire_time_record_key,
+                redis_url=kwargs.get("redis_url"),
+            )
+
+        else:
+            initial_capacity = kwargs.get("initial_capacity", 100000000)
+            error_rate = kwargs.get("error_rate", 0.00001)
+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
+                "name", "bloomfilter"
+            )
+            if filter_type == Dedup.BloomFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
+                    redis_url=kwargs.get("redis_url"),
+                )
+            elif filter_type == Dedup.MemoryFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
+                )
+            else:
+                raise ValueError(
+                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
+                )
+
+        self._to_md5 = to_md5
+
+    def __repr__(self):
+        return str(self.dedup)
+
+    def _deal_datas(self, datas):
+        if self._to_md5:
+            if isinstance(datas, list):
+                keys = [get_md5(data) for data in datas]
+            else:
+                keys = get_md5(datas)
+        else:
+            keys = copy.deepcopy(datas)
+
+        return keys
+
+    def add(
+        self, datas: Union[List[Any], Any], skip_check: bool = False
+    ) -> Union[List[Any], Any]:
+        """
+        添加数据
+        @param datas: list / 单个值
+        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
+        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
+        """
+
+        keys = self._deal_datas(datas)
+        is_added = self.dedup.add(keys, skip_check)
+
+        return is_added
+
+    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
+        """
+        检查数据是否存在
+        @param datas: list / 单个值
+        @return: list / 单个值 (存在返回1 不存在返回0)
+        """
+        keys = self._deal_datas(datas)
+        is_exists = self.dedup.get(keys)
+
+        return is_exists
+
+    def filter_exist_data(
+        self,
+        datas: List[Any],
+        *,
+        datas_fingerprints: Optional[List] = None,
+        callback: Callable[[Any], None] = None
+    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
+        """
+        过滤掉已存在的数据
+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
+        @param datas_fingerprints: 数据的唯一指纹 列表
+        @param datas: 数据 列表
+        @param callback: 数据已存在时的回调 callback(data)
+        @return: None
+        """
+
+        is_exists = self.get(datas_fingerprints or datas)
+
+        dedup_datas = []
+
+        if datas_fingerprints:
+            dedup_datas_fingerprints = []
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+                data_fingerprint = datas_fingerprints.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                    dedup_datas_fingerprints.append(data_fingerprint)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas_fingerprints.extend(dedup_datas_fingerprints)
+            datas.extend(dedup_datas)
+            return datas, datas_fingerprints
+
+        else:
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas.extend(dedup_datas)
+            return datas

+ 1 - 3
FworkSpider/feapder/network/cookie_pool.py

@@ -103,9 +103,7 @@ class PageCookiePool(CookiePoolInterface):
         """
         """
         with WebDriver(**self._kwargs) as driver:
         with WebDriver(**self._kwargs) as driver:
             driver.get(self._page_url)
             driver.get(self._page_url)
-
             cookies = driver.get_cookies()
             cookies = driver.get_cookies()
-
             cookies_json = {}
             cookies_json = {}
             for cookie in cookies:
             for cookie in cookies:
                 cookies_json[cookie["name"]] = cookie["value"]
                 cookies_json[cookie["name"]] = cookie["value"]
@@ -242,7 +240,7 @@ class LoginCookiePool(CookiePoolInterface):
         self._password_key = password_key
         self._password_key = password_key
 
 
         self._redisdb = RedisDB()
         self._redisdb = RedisDB()
-        self._mysqldb = MysqlDB()
+        self._mysqldb = None  # MySQL lookup disabled in this fork
 
 
         self.create_userbase()
         self.create_userbase()
 
 

+ 1 - 1
FworkSpider/feapder/network/proxy_pool.py

@@ -1,6 +1,6 @@
 # coding:utf8
 # coding:utf8
 """
 """
-代理池
+Proxy pool (deprecated)
 """
 """
 import datetime
 import datetime
 import json
 import json

+ 83 - 27
FworkSpider/feapder/network/request.py

@@ -31,7 +31,6 @@ class Request(object):
     session = None
     session = None
     webdriver_pool: WebDriverPool = None
     webdriver_pool: WebDriverPool = None
     user_agent_pool = user_agent
     user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
 
 
     cache_db = None  # redis / pika
     cache_db = None  # redis / pika
     cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
     cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
@@ -91,6 +90,8 @@ class Request(object):
         is_abandoned=False,
         is_abandoned=False,
         render=False,
         render=False,
         render_time=0,
         render_time=0,
+        splash=False,
+        iframes=0,
         **kwargs,
         **kwargs,
     ):
     ):
         """
         """
@@ -146,6 +147,8 @@ class Request(object):
         self.download_midware = download_midware
         self.download_midware = download_midware
         self.is_abandoned = is_abandoned
         self.is_abandoned = is_abandoned
         self.render = render
         self.render = render
+        self.splash = splash
+        self.iframes = iframes
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
 
 
         self.requests_kwargs = {}
         self.requests_kwargs = {}
@@ -197,12 +200,6 @@ class Request(object):
 
 
         return self.__class__.webdriver_pool
         return self.__class__.webdriver_pool
 
 
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool()
-
-        return self.__class__.proxies_pool
 
 
     @property
     @property
     def to_dict(self):
     def to_dict(self):
@@ -295,14 +292,15 @@ class Request(object):
 
 
         # 代理
         # 代理
         proxies = self.requests_kwargs.get("proxies", -1)
         proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
+        if not self.render:
+            if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+                while True:
+                    proxies = self.get_proxy()
+                    if proxies:
+                        self.requests_kwargs.update(proxies=proxies)
+                        break
+                    else:
+                        log.debug("暂无可用代理 ...")
 
 
         log.debug(
         log.debug(
             """
             """
@@ -331,10 +329,6 @@ class Request(object):
             )
             )
         )
         )
 
 
-        # def hooks(response, *args, **kwargs):
-        #     print(response.url)
-        #
-        # self.requests_kwargs.update(hooks={'response': hooks})
 
 
         use_session = (
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
             setting.USE_SESSION if self.use_session is None else self.use_session
@@ -353,15 +347,12 @@ class Request(object):
                 if cookie_str:
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
                     cookies = tools.get_cookies_from_str(cookie_str)
 
 
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
 
 
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
 
 
             try:
             try:
+                if proxies:
+                    self.chage_ip(browser)
                 browser.get(self.url)
                 browser.get(self.url)
                 if cookies:
                 if cookies:
                     browser.cookies = cookies
                     browser.cookies = cookies
@@ -393,6 +384,49 @@ class Request(object):
         elif use_session:
         elif use_session:
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
             response = Response(response)
+        elif self.splash:
+            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
+                'iframes': self.iframes,
+                'wait': self.render_time,
+                'html': 1,
+                'proxy': self.get_proxy().get("http"),
+                'url': self.url
+            })
+
+            response = Response(resp)
+
+            # if self.iframes:
+            # # response = Response(resp)
+            #     res = resp.json()
+            #     response = Response.from_dict(
+            #         {
+            #             "url": self.url,
+            #             "cookies": resp.cookies,
+            #             "_content": res.get("html"),
+            #             "status_code": 200,
+            #             "resp": resp,
+            #             "elapsed": 666,
+            #             "headers":resp.headers
+            #         }
+            #     )
+            # else:
+            #     res = resp.json()
+            #     html = res.get("html")
+            #     for item in res.get("childFrames"):
+            #         html += item.get("html")
+            #
+            #     response = Response.from_dict(
+            #         {
+            #             "url": self.url,
+            #             "cookies": resp.cookies,
+            #             "_content": html,
+            #             "status_code": 200,
+            #             "resp": res,
+            #             "elapsed": 666,
+            #             "headers": resp.headers
+            #
+            #         }
+            #     )
         else:
         else:
             response = requests.request(method, self.url, **self.requests_kwargs)
             response = requests.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
             response = Response(response)
@@ -404,9 +438,7 @@ class Request(object):
 
 
     def proxies(self):
     def proxies(self):
         """
         """
-
         Returns: {"https": "https://ip:port", "http": "http://ip:port"}
         Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-
         """
         """
         return self.requests_kwargs.get("proxies")
         return self.requests_kwargs.get("proxies")
 
 
@@ -422,6 +454,29 @@ class Request(object):
                 "https", ""
                 "https", ""
             ).strip("https://")
             ).strip("https://")
 
 
+    def get_proxy(self):
+        headers = {
+            "Authorization": setting.JIANYU_PROXY_AUTHOR
+        }
+        proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
+        print(f"切换代理:{proxy.get('data')}")
+        return proxy.get("data")
+
+    def chage_ip(self,browser):
+        ip = self.get_proxy().get("http")  # ip format: "127.0.0.1:80"
+        ip = ip.split("//")[-1]
+        browser.get("about:config")
+        browser.find_element_by_id("warningButton").click()
+        # JS that rewrites Firefox's proxy preferences in place
+        setupScript = '''var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
+        prefs.setIntPref("network.proxy.type", 1);
+        prefs.setCharPref("network.proxy.socks", "%s");
+        prefs.setIntPref("network.proxy.socks_port", %s);
+        ''' % (
+        ip.split(':')[0], ip.split(':')[1])
+        # run the JS
+        browser.execute_script(setupScript)
+
     def user_agent(self):
     def user_agent(self):
         headers = self.requests_kwargs.get("headers")
         headers = self.requests_kwargs.get("headers")
         if headers:
         if headers:
@@ -490,6 +545,7 @@ class Request(object):
             try:
             try:
                 response_obj = self.get_response(save_cached=save_cached)
                 response_obj = self.get_response(save_cached=save_cached)
             except FunctionTimedOut:
             except FunctionTimedOut:
+                response_obj = None
                 log.info("请求超时")
                 log.info("请求超时")
                 log.info("requests", extra={"url": self.url, "code": 0})
                 log.info("requests", extra={"url": self.url, "code": 0})
 
 

+ 513 - 0
FworkSpider/feapder/network/request6.29.py

@@ -0,0 +1,513 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-25 11:49:08
+---------
+@summary: 请求结构体
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import requests
+from func_timeout import func_set_timeout, FunctionTimedOut
+from requests.adapters import HTTPAdapter
+from requests.cookies import RequestsCookieJar
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.network import user_agent
+from feapder.network.proxy_pool import ProxyPool
+from feapder.network.response import Response
+from feapder.utils.log import Log
+from feapder.utils.webdriver import WebDriverPool
+log = Log()
+# 屏蔽warning信息
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+
+class Request(object):
+    session = None
+    webdriver_pool: WebDriverPool = None
+    user_agent_pool = user_agent
+    proxies_pool: ProxyPool = None
+
+    cache_db = None  # redis / pika
+    cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
+    cached_expire_time = 1200  # 缓存过期时间
+
+    local_filepath = None
+    oss_handler = None
+
+    __REQUEST_ATTRS__ = {
+        # 'method', 'url', 必须传递 不加入**kwargs中
+        "params",
+        "data",
+        "headers",
+        "cookies",
+        "files",
+        "auth",
+        "timeout",
+        "allow_redirects",
+        "proxies",
+        "hooks",
+        "stream",
+        "verify",
+        "cert",
+        "json",
+    }
+
+    DEFAULT_KEY_VALUE = dict(
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+    )
+
+    def __init__(
+        self,
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+        **kwargs,
+    ):
+        """
+        @summary: Request参数
+        ---------
+        框架参数
+        @param url: 待抓取url
+        @param retry_times: 当前重试次数
+        @param priority: 优先级 越小越优先 默认300
+        @param parser_name: 回调函数所在的类名 默认为当前类
+        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
+        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
+        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
+        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
+        @param use_session: 是否使用session方式
+        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
+        @param download_midware: 下载中间件。默认为parser中的download_midware
+        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
+        @param render: 是否用浏览器渲染
+        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
+        --
+        以下参数与requests参数使用方式一致
+        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
+        @param params: 请求参数
+        @param data: 请求body
+        @param json: 请求json字符串,同 json.dumps(data)
+        @param headers:
+        @param cookies: 字典 或 CookieJar 对象
+        @param files:
+        @param auth:
+        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
+        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @param verify: 为 True 时将会验证 SSL 证书
+        @param stream: 如果为 False,将会立即下载响应内容
+        @param cert:
+        --
+        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
+        ---------
+        @result:
+        """
+
+        self.url = url
+        self.retry_times = retry_times
+        self.priority = priority
+        self.parser_name = parser_name
+        self.callback = callback
+        self.filter_repeat = filter_repeat
+        self.auto_request = auto_request
+        self.request_sync = request_sync
+        self.use_session = use_session
+        self.random_user_agent = random_user_agent
+        self.download_midware = download_midware
+        self.is_abandoned = is_abandoned
+        self.render = render
+        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
+
+        self.requests_kwargs = {}
+        for key, value in kwargs.items():
+            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
+                self.requests_kwargs[key] = value
+
+            self.__dict__[key] = value
+
+    def __repr__(self):
+        try:
+            return "<Request {}>".format(self.url)
+        except:
+            return "<Request {}>".format(str(self.to_dict)[:40])
+
+    def __setattr__(self, key, value):
+        """
+        针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
+        @param key:
+        @param value:
+        @return:
+        """
+        self.__dict__[key] = value
+
+        if key in self.__class__.__REQUEST_ATTRS__:
+            self.requests_kwargs[key] = value
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    @property
+    def _session(self):
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session 优先级高
+        if use_session and not self.__class__.session:
+            self.__class__.session = requests.Session()
+            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
+            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
+            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
+            self.__class__.session.mount("http", http_adapter)
+
+        return self.__class__.session
+
+    @property
+    def _webdriver_pool(self):
+        if not self.__class__.webdriver_pool:
+            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
+
+        return self.__class__.webdriver_pool
+
+    @property
+    def _proxies_pool(self):
+        if not self.__class__.proxies_pool:
+            self.__class__.proxies_pool = ProxyPool()
+
+        return self.__class__.proxies_pool
+
+    @property
+    def to_dict(self):
+        request_dict = {}
+
+        self.callback = (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+        self.download_midware = (
+            getattr(self.download_midware, "__name__")
+            if callable(self.download_midware)
+            else self.download_midware
+        )
+
+        for key, value in self.__dict__.items():
+            if (
+                key in self.__class__.DEFAULT_KEY_VALUE
+                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
+                or key == "requests_kwargs"
+            ):
+                continue
+
+            if key in self.__class__.__REQUEST_ATTRS__:
+                if not isinstance(
+                    value, (bytes, bool, float, int, str, tuple, list, dict)
+                ):
+                    value = tools.dumps_obj(value)
+            else:
+                if not isinstance(value, (bytes, bool, float, int, str)):
+                    value = tools.dumps_obj(value)
+
+            request_dict[key] = value
+
+        return request_dict
+
+    @property
+    def callback_name(self):
+        return (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+
+    @func_set_timeout(30)
+    def get_response(self, save_cached=False):
+        """
+        获取带有selector功能的response
+        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
+        @return:
+        """
+        # 设置超时默认时间
+        self.requests_kwargs.setdefault(
+            "timeout", setting.REQUEST_TIMEOUT
+        )  # connect=22 read=22
+
+        # 设置stream
+        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
+        self.requests_kwargs.setdefault("stream", True)
+
+        # 关闭证书验证
+        self.requests_kwargs.setdefault("verify", False)
+
+        # 设置请求方法
+        method = self.__dict__.get("method")
+        if not method:
+            if "data" in self.requests_kwargs:
+                method = "POST"
+            else:
+                method = "GET"
+
+        # 随机user—agent
+        headers = self.requests_kwargs.get("headers", {})
+        if "user-agent" not in headers and "User-Agent" not in headers:
+            if self.render:  # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
+                ua = setting.WEBDRIVER.get(
+                    "user_agent"
+                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+            else:
+                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+
+            if self.random_user_agent and setting.RANDOM_HEADERS:
+                headers.update({"User-Agent": ua})
+                self.requests_kwargs.update(headers=headers)
+        else:
+            self.requests_kwargs.setdefault(
+                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
+            )
+
+        # 代理
+        proxies = self.requests_kwargs.get("proxies", -1)
+        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+            while True:
+                proxies = self._proxies_pool.get()
+                if proxies:
+                    self.requests_kwargs.update(proxies=proxies)
+                    break
+                else:
+                    log.debug("暂无可用代理 ...")
+
+        log.debug(
+            """
+                -------------- %srequest for ----------------
+                url  = %s
+                method = %s
+                body = %s
+                """
+            % (
+                ""
+                if not self.parser_name
+                else "%s.%s "
+                % (
+                    self.parser_name,
+                    (
+                        self.callback
+                        and callable(self.callback)
+                        and getattr(self.callback, "__name__")
+                        or self.callback
+                    )
+                    or "parse",
+                ),
+                self.url,
+                method,
+                self.requests_kwargs,
+            )
+        )
+
+        # def hooks(response, *args, **kwargs):
+        #     print(response.url)
+        #
+        # self.requests_kwargs.update(hooks={'response': hooks})
+
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session 优先级高
+
+        if self.render:
+            # 使用request的user_agent、cookies、proxy
+            user_agent = headers.get("User-Agent") or headers.get("user-agent")
+            cookies = self.requests_kwargs.get("cookies")
+            print(cookies)
+            if cookies and isinstance(cookies, RequestsCookieJar):
+                cookies = cookies.get_dict()
+
+            if not cookies:
+                cookie_str = headers.get("Cookie") or headers.get("cookie")
+                if cookie_str:
+                    cookies = tools.get_cookies_from_str(cookie_str)
+
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+
+            try:
+                browser.get(self.url)
+                if cookies:
+                    browser.cookies = cookies
+                if self.render_time:
+                    tools.delay_time(self.render_time)
+
+                html = browser.page_source
+                response = Response.from_dict(
+                    {
+                        "url": browser.current_url,
+                        "cookies": browser.cookies,
+                        "_content": html.encode(),
+                        "status_code": 200,
+                        "elapsed": 666,
+                        "headers": {
+                            "User-Agent": browser.execute_script(
+                                "return navigator.userAgent"
+                            ),
+                            "Cookie": tools.cookies2str(browser.cookies),
+                        },
+                    }
+                )
+
+                response.browser = browser
+            except Exception as e:
+                self._webdriver_pool.remove(browser)
+                raise e
+
+        elif use_session:
+            response = self._session.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+        else:
+            response = requests.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+
+        if save_cached:
+            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
+        log.info("requests",extra={"url":response.url,"code":response.status_code})
+        return response
+
+    def proxies(self):
+        """
+
+        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
+
+        """
+        return self.requests_kwargs.get("proxies")
+
+    def proxy(self):
+        """
+
+        Returns: ip:port
+
+        """
+        proxies = self.proxies()
+        if proxies:
+            return proxies.get("http", "").strip("http://") or proxies.get(
+                "https", ""
+            ).strip("https://")
+
+    def user_agent(self):
+        headers = self.requests_kwargs.get("headers")
+        if headers:
+            return headers.get("user_agent") or headers.get("User-Agent")
+
+    @property
+    def fingerprint(self):
+        """
+        request唯一表识
+        @return:
+        """
+        url = self.__dict__.get("url", "")
+        # url 归一化
+        url = tools.canonicalize_url(url)
+        args = [url]
+
+        for arg in ["params", "data", "files", "auth", "cert", "json"]:
+            if self.requests_kwargs.get(arg):
+                args.append(self.requests_kwargs.get(arg))
+
+        return tools.get_md5(*args)
+
+    @property
+    def _cache_db(self):
+        if not self.__class__.cache_db:
+            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
+
+        return self.__class__.cache_db
+
+    @property
+    def _cached_redis_key(self):
+        if self.__class__.cached_redis_key:
+            return (
+                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
+            )
+        else:
+            return f"response_cached:test:{self.fingerprint}"
+
+    def save_cached(self, response, expire_time=1200):
+        """
+        使用redis保存response 用于调试 不用每回都下载
+        @param response:
+        @param expire_time: 过期时间
+        @return:
+        """
+
+        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
+
+    def get_response_from_cached(self, save_cached=True):
+        """
+        从缓存中获取response
+        注意:
+            属性值为空:
+                -raw : urllib3.response.HTTPResponse
+                -connection:requests.adapters.HTTPAdapter
+                -history
+
+            属性含义改变:
+                - request 由requests 改为Request
+        @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
+        @return:
+        """
+        response_dict = self._cache_db.strget(self._cached_redis_key)
+        if not response_dict:
+            log.info("无response缓存  重新下载")
+            try:
+                response_obj = self.get_response(save_cached=save_cached)
+            except FunctionTimedOut:
+                log.info("请求超时")
+                log.info("requests", extra={"url": self.url, "code": 0})
+
+        else:
+            response_dict = eval(response_dict)
+            response_obj = Response.from_dict(response_dict)
+        return response_obj
+
+    def del_response_cached(self):
+        self._cache_db.clear(self._cached_redis_key)
+
+    @classmethod
+    def from_dict(cls, request_dict):
+        for key, value in request_dict.items():
+            if isinstance(value, bytes):  # 反序列化 如item
+                request_dict[key] = tools.loads_obj(value)
+
+        return cls(**request_dict)
+
+    def copy(self):
+        return self.__class__.from_dict(self.to_dict)

+ 31 - 13
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -2,14 +2,14 @@
 """
 """
 Created on {DATE}
 Created on {DATE}
 ---------
 ---------
-@summary:
+@summary: ${spider_name}
 ---------
 ---------
 @author: {USER}
 @author: {USER}
 """
 """
-
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
+from items.spider_item import DataBakItem,MgpListItem,ListItem
 from feapder.dedup import Dedup
 from feapder.dedup import Dedup
 from collections import namedtuple
 from collections import namedtuple
 
 
@@ -17,21 +17,20 @@ from collections import namedtuple
 class ${spider_name}(feapder.Spider):
 class ${spider_name}(feapder.Spider):
 
 
     def start_callback(self):
     def start_callback(self):
-         self.count = 0
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
 
          self.menus = [
          self.menus = [
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
+             Menu('${spider_name} channel name', '${spider_name} spidercode', "custom type", 1),
+             Menu('${spider_name} channel name', '${spider_name} spidercode', "Notice", 1),
          ]
          ]
     def start_requests(self):
     def start_requests(self):
          for menu in self.menus:
          for menu in self.menus:
-            start_url = f''
-            yield feapder.Request(url=start_url, item=menu._asdict())
+             for page in range(1,menu.crawl_page+1):
+                 start_url = f''
+                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
 
 
     def parse(self, request, response):
     def parse(self, request, response):
         menu = request.item
         menu = request.item
-        self.count += 1   # 一个计数器
         dedup = Dedup(Dedup.BloomFilter)
         dedup = Dedup(Dedup.BloomFilter)
         href_list = []
         href_list = []
         info_list = []
         info_list = []
@@ -56,15 +55,34 @@ class ${spider_name}(feapder.Spider):
             list_item.parse = "self.detail_get"
             list_item.parse = "self.detail_get"
             list_item.parser_name = "details"
             list_item.parser_name = "details"
             list_item.item = data_item.to_dict
             list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
+            list_item.deal_detail = ['//div[@class="****"]',"*****"]
+            list_item.proxies = False
             list_item.parse_url = href
             list_item.parse_url = href
+            list_item.pri = 1
+            list_item.files = {
+                "list_xpath": '//div[@class="notice-foot"]/a',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                "files_type": ('zip', 'docx', 'ftp'),
+                "file_type": 'zip',
+                "url_key": 'attachmentDownload',
+                # "host": 'http',
+                "kwargs": {"headers": {
+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
+                }}
+            }
             href_list.append(href)
             href_list.append(href)
             yield list_item
             yield list_item
+        channel_list = ListItem()  # avoid shadowing the built-in list
+        channel_list.site = self.site
+        channel_list.channel = menu.get("channel")
+        channel_list.spidercode = menu.get("code")
+        channel_list.url = request.url
+        channel_list.count = len(info_list)
+        channel_list.rel_count = len(href_list)
         dedup.add(href_list)
         dedup.add(href_list)
 
 
     def end_callback(self):
     def end_callback(self):
         print("爬虫结束")
         print("爬虫结束")
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    ${spider_name}(redis_key="fwork:${spider_name}").start()
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 1 - 1
FworkSpider/feapder/templates/spider_template.tmpl

@@ -64,4 +64,4 @@ class ${spider_name}(feapder.Spider):
         return request
         return request
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    ${spider_name}(redis_key="fwork:${spider_name}").start()
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 37 - 17
FworkSpider/feapder/utils/aliyun.py

@@ -1,3 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 12:39 AM
+---------
+@summary:  Aliyun OSS attachment upload
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
 import hashlib
 import hashlib
 import os
 import os
 import traceback
 import traceback
@@ -56,7 +65,7 @@ class UploadOSS:
                 else:
                 else:
                     return "{:.1f} kb".format(_kb)
                     return "{:.1f} kb".format(_kb)
 
 
-    def get_state(self, attachment, **kwargs):
+    def get_state(self, attachment,count=0, **kwargs):
         """
         """
         下载附件并上传阿里oss
         下载附件并上传阿里oss
 
 
@@ -78,7 +87,10 @@ class UploadOSS:
                 if not os.path.exists(img_dir):
                 if not os.path.exists(img_dir):
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                 # 打开目录,放入下载的附件
                 # 打开目录,放入下载的附件
-                self.file_path = "{}/{}".format(img_dir, attachment["filename"])
+                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filname = filname.hexdigest() #加密1次
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filname+'.'+types)
                 with open(self.file_path, 'wb') as f:
                 with open(self.file_path, 'wb') as f:
                     f.write(self.file_stream)
                     f.write(self.file_stream)
                 # 上传附件
                 # 上传附件
@@ -89,13 +101,16 @@ class UploadOSS:
                 # 返回附件上传处理信息
                 # 返回附件上传处理信息
                 return file_state
                 return file_state
             else:
             else:
-                attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                attachment["url"] = 'oss'
-                attachment["fid"] = self.fid + "." + attachment["ftype"]
-                attachment["size"] = '0kb'
-                attachment["false"] = True
-                return attachment
-    def post_state(self, attachment, **kwargs):
+                if count<3:
+                    return self.post_state(attachment, count=count + 1, **kwargs)
+                else:
+                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
+    def post_state(self, attachment,count=0, **kwargs):
         """
         """
         下载附件并上传阿里oss
         下载附件并上传阿里oss
 
 
@@ -116,7 +131,10 @@ class UploadOSS:
                 if not os.path.exists(img_dir):
                 if not os.path.exists(img_dir):
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                     os.makedirs(img_dir, mode=0o777, exist_ok=True)
                 # 打开目录,放入下载的附件
                 # 打开目录,放入下载的附件
-                self.file_path = "{}/{}{}".format(img_dir,time.time(),attachment["filename"])
+                filename = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filename = filename.hexdigest()  # md5 the original name once
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filename + '.' + types)
 
 
                 with open(self.file_path, 'wb') as f:
                 with open(self.file_path, 'wb') as f:
                     f.write(self.file_stream)
                     f.write(self.file_stream)
@@ -128,12 +146,14 @@ class UploadOSS:
                 # 返回附件上传处理信息
                 # 返回附件上传处理信息
                 return file_state
                 return file_state
             else:
             else:
-                attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                attachment["url"] = 'oss'
-                attachment["fid"] = self.fid + "." + attachment["ftype"]
-                attachment["size"] = '0kb'
-                attachment["false"] = True
-                return attachment
+                if count<3:
+                    return self.post_state(attachment, count=count + 1, **kwargs)
+                else:
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
 
 
     def put_oss_from_local(self):
     def put_oss_from_local(self):
         """上传一个本地文件到阿里OSS的普通文件"""
         """上传一个本地文件到阿里OSS的普通文件"""
@@ -148,7 +168,7 @@ class UploadOSS:
         @param attachment: 附件
         @param attachment: 附件
         @return: 附件上传处理信息
         @return: 附件上传处理信息
         """
         """
-        attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
         attachment["url"] = 'oss'
         attachment["url"] = 'oss'
         attachment["fid"] = self.fid + "." + attachment["ftype"]
         attachment["fid"] = self.fid + "." + attachment["ftype"]
         attachment["size"] = self.file_size
         attachment["size"] = self.file_size

+ 1 - 1
FworkSpider/feapder/utils/email_sender.py

@@ -2,7 +2,7 @@
 """
 """
 Created on 2020/2/19 12:57 PM
 Created on 2020/2/19 12:57 PM
 ---------
 ---------
-@summary:
+@summary: email sending
 ---------
 ---------
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com

+ 41 - 47
FworkSpider/feapder/utils/log.py

@@ -10,10 +10,11 @@ Created on 2018-12-08 16:50
 import logging
 import logging
 import os
 import os
 import sys
 import sys
+import time
 from logging.handlers import BaseRotatingHandler
 from logging.handlers import BaseRotatingHandler
 
 
-import logstash
 import loguru
 import loguru
+import pymongo
 from better_exceptions import format_exception
 from better_exceptions import format_exception
 
 
 import feapder.setting as setting
 import feapder.setting as setting
@@ -40,45 +41,47 @@ class RotatingFileHandler(BaseRotatingHandler):
         self.max_bytes = max_bytes
         self.max_bytes = max_bytes
         self.backup_count = backup_count
         self.backup_count = backup_count
         self.placeholder = str(len(str(backup_count)))
         self.placeholder = str(len(str(backup_count)))
+        self._to_db = None
+        self.filename = filename
+
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+
+        return self._to_db.pyspider
 
 
-    def doRollover(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-        if self.backup_count > 0:
-            for i in range(self.backup_count - 1, 0, -1):
-                sfn = ("%0" + self.placeholder + "d.") % i  # '%2d.'%i -> 02
-                sfn = sfn.join(self.baseFilename.split("."))
-                # sfn = "%d_%s" % (i, self.baseFilename)
-                # dfn = "%d_%s" % (i + 1, self.baseFilename)
-                dfn = ("%0" + self.placeholder + "d.") % (i + 1)
-                dfn = dfn.join(self.baseFilename.split("."))
-                if os.path.exists(sfn):
-                    # print "%s -> %s" % (sfn, dfn)
-                    if os.path.exists(dfn):
-                        os.remove(dfn)
-                    os.rename(sfn, dfn)
-            dfn = (("%0" + self.placeholder + "d.") % 1).join(
-                self.baseFilename.split(".")
-            )
-            if os.path.exists(dfn):
-                os.remove(dfn)
-            # Issue 18940: A file may not have been created if delay is True.
-            if os.path.exists(self.baseFilename):
-                os.rename(self.baseFilename, dfn)
-        if not self.delay:
-            self.stream = self._open()
 
 
     def shouldRollover(self, record):
     def shouldRollover(self, record):
+        parmars = {
+            "spider_name":record.name,
+            "msg":record.msg,
+            "Message":str(record.getMessage)
+        }
+        if record.levelname == "ERROR":
+            crawl_type = 'list'
+            if 'detail' in record.name:
+                crawl_type = 'detail'
+            url = ''
+            item={
+                "recordname":record.name,
+                "spidercode":"spidercode",
+                "author":self.filename,
+                "account":"",
+                "crawl_time":time.time(),
+                "crawl_type": crawl_type,
+                "status_code":"status_code",
+                "url":url,
+                "reason":record.msg,
+                'parmars': parmars,
+            }
+
+            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
+            # print(item)
+            # print(self.to_db.error_info)
+            # self.to_db.error_info.insert_one(item)

-        if self.stream is None:  # delay was set...
-            self.stream = self._open()
-        if self.max_bytes > 0:  # are we rolling over?
-            msg = "%s\n" % self.format(record)
-            self.stream.seek(0, 2)  # due to non-posix-compliant Windows feature
-            if self.stream.tell() + len(msg) >= self.max_bytes:
-                return 1
-        return 0


 def get_logger(
@@ -87,7 +90,6 @@ def get_logger(
     log_level=None,
     is_write_to_console=None,
     is_write_to_file=None,
-    is_send_to_logstash = None,
     color=None,
     mode=None,
     max_bytes=None,
@@ -111,7 +113,6 @@ def get_logger(
     @result:
     """
     # 加载setting里最新的值
-    # name = os.path.split(os.getcwd())[-1]
     name = name or setting.LOG_NAME
     path = path or setting.LOG_PATH
     log_level = log_level or setting.LOG_LEVEL
@@ -125,11 +126,6 @@ def get_logger(
         if is_write_to_file is not None
         else setting.LOG_IS_WRITE_TO_FILE
     )
-    is_send_to_logstash = (
-        is_send_to_logstash
-        if is_send_to_logstash is not None
-        else setting.LOG_IS_SEND_TO_LOGSTASH
-    )
     color = color if color is not None else setting.LOG_COLOR
     mode = mode or setting.LOG_MODE
     max_bytes = max_bytes or setting.LOG_MAX_BYTES
@@ -148,8 +144,8 @@ def get_logger(

     # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
     if is_write_to_file:
-        if path and not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
+        # if path and not os.path.exists(os.path.dirname(path)):
+        #     os.makedirs(os.path.dirname(path))

         rf_handler = RotatingFileHandler(
             path,
@@ -160,8 +156,6 @@ def get_logger(
         )
         rf_handler.setFormatter(formatter)
         logger.addHandler(rf_handler)
-    if is_send_to_logstash:
-        logger.addHandler(logstash.TCPLogstashHandler(setting.LOGSTASH_IP, setting.LOGSTASH_PORT, version=1))
     if color and is_write_to_console:
         loguru_handler = InterceptHandler()
         loguru_handler.setFormatter(formatter)

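Note on the log.py change above: size-based rollover is removed and shouldRollover now only assembles an error record for ERROR-level messages; the MongoDB write itself is left commented out in this commit. A minimal sketch of what enabling that write-back could look like, reusing the connection pattern of the new to_db property (the pyspider database and error_info collection names are taken from the commented-out code; nothing beyond the hunk is assumed):

import time
import pymongo
import feapder.setting as setting

client = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
error_info = client.pyspider.error_info  # collection named in the commented-out insert

def report_error(record_name: str, reason: str, url: str = ""):
    # mirrors the item dict built in shouldRollover for ERROR records
    error_info.insert_one({
        "recordname": record_name,
        "crawl_type": "detail" if "detail" in record_name else "list",
        "crawl_time": time.time(),
        "url": url,
        "reason": reason,
    })
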
+ 1 - 1
FworkSpider/feapder/utils/redis_lock.py

@@ -107,7 +107,7 @@ class RedisLock:
                 time.sleep(1)
                 continue
             self.redis_conn.expire(self.lock_key, expire + 5)  # 延长5秒
-            time.sleep(5)  # 临过期5秒前,再次延长
+            time.sleep(expire)  # 临过期5秒前,再次延长
             spend_time += expire
             if self.lock_timeout and spend_time > self.lock_timeout:
                 log.info("锁超时,释放")

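The redis_lock.py change makes the renewal loop sleep a full expire interval instead of a fixed 5 seconds, so spend_time actually tracks how long the lock has been held. A simplified, self-contained sketch of that loop (the surrounding RedisLock class is assumed; names follow the hunk above):

import time

def keep_lock_alive(redis_conn, lock_key, expire, lock_timeout):
    spend_time = 0
    while True:
        redis_conn.expire(lock_key, expire + 5)  # push the TTL out again
        time.sleep(expire)                       # was a fixed 5-second nap before this commit
        spend_time += expire                     # now matches the real elapsed time
        if lock_timeout and spend_time > lock_timeout:
            break                                # held past lock_timeout -> caller releases the lock
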
+ 2 - 5
FworkSpider/feapder/utils/tools.py

@@ -7,7 +7,6 @@ Created on 2018-09-06 14:21
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-print('123木头人')
 import asyncio
 import calendar
 import codecs
@@ -48,7 +47,6 @@ from w3lib.url import canonicalize_url as _canonicalize_url
 import feapder.setting as setting
 from feapder.utils.email_sender import EmailSender
 from feapder.utils.log import log
-
 os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js

 # 全局取消ssl证书验证
@@ -58,8 +56,7 @@ TIME_OUT = 30
 TIMER_TIME = 5

 redisdb = None
-def ccmu():
-    print('sss')
+

 def get_redisdb():
     global redisdb
@@ -75,7 +72,7 @@ def get_redisdb():
     return redisdb


-# 装饰器
+# 装饰器 -- 单例模式
 class Singleton(object):
     def __init__(self, cls):
         self._cls = cls

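The comment change above labels Singleton as a singleton decorator; only its head is visible in this hunk. A typical completion and usage, shown here as an assumed sketch rather than the file's actual body:

class Singleton(object):
    def __init__(self, cls):
        self._cls = cls
        self._instance = None

    def __call__(self, *args, **kwargs):
        # create the wrapped class once, then keep handing back the same object
        if self._instance is None:
            self._instance = self._cls(*args, **kwargs)
        return self._instance

@Singleton
class Config:
    pass

assert Config() is Config()  # both calls return the same instance
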
+ 3 - 8
FworkSpider/feapder/utils/webdriver.py

@@ -22,6 +22,7 @@ DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit


 class WebDriver(RemoteWebDriver):
+    '''浏览器采集 - selenium'''
     CHROME = "CHROME"
     PHANTOMJS = "PHANTOMJS"
     FIREFOX = "FIREFOX"
@@ -111,12 +112,6 @@ class WebDriver(RemoteWebDriver):
             firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
             firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
             # firefox_capabilities["marionette"] = True  # http代理的使用
-            # firefox_capabilities["proxy"] = {
-            #     "proxyType": "MANUAL",
-            #     "httpProxy": proxy,
-            #     "ftpProxy": proxy,
-            #     "sslProxy": proxy,
-            # }

         if self._user_agent:
             firefox_profile.set_preference(
@@ -279,8 +274,8 @@ class WebDriver(RemoteWebDriver):
         else:
             raise AttributeError

-    # def __del__(self):
-    #     self.quit()
+    def __del__(self):
+        self.quit()


 @Singleton

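Re-enabling __del__ means a WebDriver instance now calls quit() when it is garbage-collected, so dropping the last reference closes the browser. A toy illustration of the pattern (hypothetical class, not feapder's):

class AutoQuit:
    def quit(self):
        print("browser closed")

    def __del__(self):
        # same idea as the re-enabled WebDriver.__del__ above
        self.quit()

d = AutoQuit()
del d  # prints "browser closed" once the object is collected
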
+ 4 - 4
FworkSpider/mongo_pipeline.py

@@ -9,17 +9,16 @@ Created on 2021-04-18 14:12:21
 """
 from typing import Dict, List, Tuple
 import time
-# from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
 from feapder.dedup import Dedup
 from feapder.pipelines import BasePipeline
 from feapder.utils.log import log
 from untils.tools import *
-# from crawlab import save_item



-class MongoPipeline(BasePipeline):
+class RedisPipeline(BasePipeline):
+    '''数据存储管道-redis版'''
     def __init__(self):
         self._to_db = None

@@ -27,6 +26,7 @@ class MongoPipeline(BasePipeline):
     def to_db(self):
         if not self._to_db:
             self._to_db = RedisDB()
+            print("创建新连接?")

         return self._to_db

@@ -42,7 +42,7 @@ class MongoPipeline(BasePipeline):
         """
         try:
             add_count = self.to_db.lpush(table="savemongo:"+table, values=items)
-            # add_count = self.to_db.lpop(table="savemongo:"+table, values=items)
+            print(add_count)
             datas_size = len(items)
             log.info(
                 "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"

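With this rename the pipeline no longer writes MongoDB itself: save_items pushes each batch onto a Redis list named savemongo:<table> for a downstream writer to consume. A minimal sketch of that call, using the same RedisDB class imported in the diff (connection details come from the REDISDB_* settings):

from feapder.db.redisdb import RedisDB

db = RedisDB()
items = [{"title": "demo", "href": "http://example.com/1"}]
added = db.lpush(table="savemongo:" + "mgp_list", values=items)
print("queued:", added)
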
+ 98 - 0
FworkSpider/mongo_pipeline_old.py

@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 导出数据
+---------
+@author: 马国鹏
+@email:  305021384@qq.com
+"""
+from typing import Dict, List, Tuple
+import time
+from feapder.db.mongodb import MongoDB
+from feapder.dedup import Dedup
+from feapder.pipelines import BasePipeline
+from feapder.utils.log import log
+from untils.tools import *
+# from crawlab import save_item
+
+
+
+class MongoPipeline(BasePipeline):
+    def __init__(self):
+        self._to_db = None
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+            print("创建新连接?")
+
+        return self._to_db
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+        """
+        try:
+            add_count = self.to_db.add_batch(coll_name=table, datas=items)
+            for item in items:
+                dedup = Dedup(Dedup.BloomFilter)
+                dedup.add([item.get("href")])
+                # save_item({'count':item.get("href")})
+            datas_size = len(items)
+            log.info(
+                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
+                % (datas_size, table, add_count, datas_size - add_count)
+            )
+            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
+            # for i in range(add_count):
+            # if table == "mgp_list":
+            #     save_item({"site": "失败回填", "title": add_count})
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
+
+    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+        try:
+            # self.to_db.find()
+            add_count = self.to_db.add_batch(
+                coll_name=table,
+                datas=items,
+                update_columns=update_keys or list(items[0].keys()),
+            )
+            datas_size = len(items)
+            update_count = datas_size - add_count
+            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
+                datas_size,
+                table,
+                add_count,
+                update_count,
+            )
+            if update_keys:
+                msg += " 更新字段为 {}".format(update_keys)
+            log.info(msg)
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False

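The retired pipeline above wrote straight to MongoDB with add_batch and then registered each href in the Bloom-filter dedup. A condensed sketch of that registration step, as it appears in save_items:

from feapder.dedup import Dedup

items = [{"href": "http://example.com/1"}, {"href": "http://example.com/2"}]
for item in items:
    dedup = Dedup(Dedup.BloomFilter)
    dedup.add([item.get("href")])  # mark the URL as seen after a successful insert
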
+ 39 - 10
FworkSpider/setting.py

@@ -24,11 +24,11 @@ MONGO_DB = "py_spider"
 # # REDIS
 # # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
 # REDISDB_IP_PORTS = "192.168.20.51:6379"  # 本地 docker 环境
-REDISDB_IP_PORTS = "172.19.0.1:6379"  # 本地环境
+REDISDB_IP_PORTS = "172.19.0.1:6379"  # 环境
 # REDISDB_USER_PASS = ""
 REDISDB_DB = 10
 # # 适用于redis哨兵模式
-REDISDB_SERVICE_NAME = "quchoong"
+REDISDB_SERVICE_NAME = "quchoong"  # 没用到
 #
 # # 数据入库的pipeline,可自定义,默认MysqlPipeline
 ITEM_PIPELINES = [
@@ -44,9 +44,9 @@ EXPORT_DATA_MAX_RETRY_TIMES = 5 # 导出数据时最大的重试次数,包括
 # COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
 # COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
 #
-REDIS_KEY = "fwork"
+REDIS_KEY = "fwork" # 没用到
 # # SPIDER
-# SPIDER_THREAD_COUNT = 4  # 爬虫并发数
+SPIDER_THREAD_COUNT = 1  # 爬虫并发数
 # SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
 # SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
 SPIDER_MAX_RETRY_TIMES = 5  # 每个请求最大重试次数
@@ -123,7 +123,8 @@ WECHAT_WARNING_PHONE = "马国鹏"  # 报警人 将会在群内@此人, 支持
 WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
 # # 时间间隔
 WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+WARNING_LEVEL = "INFO"  # 报警级别, DEBUG / ERROR
 WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
 #
 #LOG_NAME = os.path.basename(os.getcwd())
@@ -134,19 +135,47 @@ LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
 LOG_LEVEL = "INFO"
 LOG_COLOR = True  # 是否带有颜色
 LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
-LOG_IS_WRITE_TO_FILE = False  # 是否写文件
-LOG_MODE = "w"  # 写文件的模式
+# LOG_IS_WRITE_TO_FILE = True  # 是否写文件
+# LOG_MODE = "w"  # 写文件的模式
 LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
 LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_ENCODING = "utf8"  # 日志文件编码
-OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
+OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级 一般用不到
 #
 # # 切换工作路径为当前项目路径
 # project_path = os.path.abspath(os.path.dirname(__file__))
 # os.chdir(project_path)  # 切换工作路经
 # sys.path.insert(0, project_path)
 # print('当前工作路径为 ' + os.getcwd())
+
+# 代理服务-未解析的
 jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
+
 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
-oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-datafile'}
-author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
+
+# 文件存储功能的配置信息
+oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh',
+      'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
+# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
+
+author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
+
+# 线上代理服务的api地址
+JIANYU_PROXY_URL = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
+JIANYU_PROXY_AUTHOR = 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'
+
+# splash 渲染服务的api接口配置
+JIANYU_SPLASH_URL = 'http://8.131.72.226:8998/render.json'
+
+# 测试环境的redis集群 -- url去重专用
+REDISCLUSTER =  [
+                {"host": "192.168.3.207", "port": "2179"},
+                {"host": "192.168.3.166", "port": "2379"}
+            ]
+
+# 正式环境的redis集群 -- url去重专用
+# REDISCLUSTER =  [
+#                 {"host": "172.17.4.239", "port": "2479"},
+#                 {"host": "172.17.4.240", "port": "2579"},
+#                 {"host": "172.17.4.84", "port": "2379"}
+#             ]

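The new JIANYU_PROXY_URL / JIANYU_PROXY_AUTHOR settings are consumed by the get_proxy helper added to untils/tools.py later in this commit; a minimal sketch of that request (setting is the project-root module, as imported elsewhere in the repo):

import requests
import setting

resp = requests.get(setting.JIANYU_PROXY_URL,
                    headers={"Authorization": setting.JIANYU_PROXY_AUTHOR})
proxies = resp.json().get("data")  # e.g. a dict with an "http" proxy entry
print(proxies)
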
+ 141 - 95
FworkSpider/untils/attachment.py

@@ -1,72 +1,22 @@
 import hashlib
 import os
-import re
+import sys
 import traceback
 import uuid
-from urllib.parse import urlparse, unquote
-
+from urllib import request
 import requests
 import urllib3
-
 from feapder.setting import headers
 from untils.execptions import AttachmentNullError
 from untils.aliyun import AliYunService
 from untils.proxy_pool import ProxyPool
-
+import time
+import tqdm
 urllib3.disable_warnings()
-
-
-def hex_sha1(val):
-    sha1 = hashlib.sha1()
-    if isinstance(val, bytes):
-        sha1.update(str(val).encode("utf-8"))
-    elif isinstance(val, str):
-        sha1.update(val.encode("utf-8"))
-    res = sha1.hexdigest()
-    return res
-
-
-def extract_file_type(text):
-    if text is None:
-        return None
-
-    file_types = {
-        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
-    }
-    for file_type in file_types:
-        tmp = [file_type, file_type.upper()]
-        for t in tmp:
-            result = re.match(f'.*{t}$', text, re.S)
-            if result is not None:
-                return t
-    else:
-        return None
-
-
-def extract_file_name(href: str, file_type: str):
-    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
-    # 中文字符:[\u4e00 -\u9fa5]
-    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
-    parser = urlparse(href)
-    query = (parser.query or parser.path)
-    result = re.search(f'.*\\.{file_type}', query, re.S)
-    if result is not None:
-        encode_str = unquote(result.group())
-        name = re.search(zh_char_pattern, encode_str)
-        if name is not None:
-            return unquote(name.group())
-    return None
-
-
-def verify_file_name(name):
-    if extract_file_type(name) is None:
-        raise ValueError
-
-
 class AttachmentDownloader:
-
+    '''附件下载模块'''
     def __init__(self):
-        self.dir_name = '/file'
+        self.dir_name = 'file'

     def create_dir(self):
         if not os.path.exists(self.dir_name):
@@ -74,13 +24,52 @@ class AttachmentDownloader:

     def create_file_path(self, filename, file_type):
         self.create_dir()
-        sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
+        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
         tmp_name = "{}.{}".format(sign, file_type)
         return "{}/{}".format(self.dir_name, tmp_name)

+    def hex_sha1(self,val):
+        sha1 = hashlib.sha1()
+        if isinstance(val, bytes):
+            sha1.update(str(val).encode("utf-8"))
+        elif isinstance(val, str):
+            sha1.update(val.encode("utf-8"))
+        res = sha1.hexdigest()
+        return res
+
     @staticmethod
     def create_fid(file_stream: bytes):
-        return hex_sha1(file_stream)
+        sha1 = hashlib.sha1()
+        if isinstance(file_stream, bytes):
+            sha1.update(str(file_stream).encode("utf-8"))
+        elif isinstance(file_stream, str):
+            sha1.update(file_stream.encode("utf-8"))
+        res = sha1.hexdigest()
+        return res
+
+
+    @staticmethod
+    def clean_attachment(file_path):
+        os.remove(file_path)
+
+    @staticmethod
+    def getsize(file_path: str):
+        def _getsize(filename):
+            try:
+                return os.path.getsize(filename)
+            except:
+                return 0
+
+        _kb = float(_getsize(file_path)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
 
 
     @staticmethod
     @staticmethod
     def _fetch_attachment(
     def _fetch_attachment(
@@ -94,20 +83,28 @@ class AttachmentDownloader:
         request_params.setdefault('headers', kwargs.get('headers') or headers)
         request_params.setdefault('proxies', kwargs.get('proxies'))
         request_params.setdefault('timeout', kwargs.get('timeout') or 60)
-        request_params.setdefault('stream', kwargs.get('stream') or True)
+        # request_params.setdefault('stream', kwargs.get('stream') or True)
         request_params.setdefault('verify', kwargs.get('verify') or False)
         if enable_proxy:
-            proxy = ProxyPool()
+            proxy = ProxyPool().get()
         else:
             proxy = {}
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url, **request_params) as req:
+                with requests.get(url,stream=True, **request_params) as req:
+                    content_size = req.headers.get('Content-Length') or 0
+                    content_size = int(content_size)
+                    stream = b''
                     if req.status_code == 200:
-                        stream = req.content
                         with open(file_path, 'wb') as f:
-                            f.write(stream)
+                            with tqdm.tqdm(total=content_size, unit='B', initial=0, unit_scale=True, unit_divisor=1024,
+                                      ascii=True,desc=file_path) as bar:
+                                for chunk in req.iter_content(chunk_size=1024*20):
+                                    if chunk:
+                                        f.write(chunk)
+                                    stream += chunk
+                                    bar.update(len(chunk))
                         return stream
                     else:
                         retries += 1
@@ -115,33 +112,10 @@ class AttachmentDownloader:
                 if allow_show_exception:
                     traceback.print_exc()
                 if enable_proxy:
-                    request_params.update({'proxies': proxy.get()})
+                    request_params.update({'proxies': ProxyPool().get()})
                 retries += 1
         return b''

-    @staticmethod
-    def clean_attachment(file_path):
-        os.remove(file_path)
-
-    @staticmethod
-    def getsize(file_path: str):
-        def _getsize(filename):
-            try:
-                return os.path.getsize(filename)
-            except:
-                return 0
-
-        _kb = float(_getsize(file_path)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
-            if _M >= 1024:
-                _G = _M / 1024
-                return "{:.1f} G".format(_G)
-            else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
-
     def fetch_attachment(
             self,
             file_name: str,
@@ -153,7 +127,6 @@ class AttachmentDownloader:
     ):
         if not file_name or not file_type or not download_url:
             raise AttachmentNullError
-
         file_path = self.create_file_path(file_name, file_type)
         file_stream = self._fetch_attachment(
             download_url,
@@ -162,6 +135,7 @@ class AttachmentDownloader:
             allow_request_exception,
             **kwargs
         )
+        # file_stream = self.download_file(download_url,file_path,enable_proxy,allow_request_exception)
         if len(file_stream) > 0:
             fid = self.create_fid(file_stream)
             '''上传/下载,无论失败成功都需要给出文件基础信息'''
@@ -188,11 +162,83 @@ class AttachmentDownloader:
             }
         return result

+    def download_file(self, url, file_path, call_func=None,enable_proxy=False,data=None):
+        """
+        Args:
+            url: 地址
+            file_path: 文件存储地址
+            call_func: 下载成功的回调
+        Returns:
+        """
+        # proxies = kwargs.get('proxies') or None
+        # data = kwargs.get('data') or None
+        start_time = time.time()
+        def progress_callfunc(blocknum, blocksize, totalsize):
+            """回调函数
+            @blocknum : 已经下载的数据块
+            @blocksize : 数据块的大小
+            @totalsize: 远程文件的大小
+            """
+            speed = (blocknum * blocksize) / (time.time() - start_time)
+            # speed_str = " Speed: %.2f" % speed
+            speed_str = " Speed: %s" % format_size(speed)
+            recv_size = blocknum * blocksize
+
+            # 设置下载进度条
+            f = sys.stdout
+            pervent = recv_size / totalsize
+            percent_str = "%.2f%%" % (pervent * 100)
+            n = round(pervent * 50)
+            s = ('#' * n).ljust(50, '-')
+            f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
+            f.flush()
+            f.write('\r')
+
+        def format_size(bytes):
+            try:
+                bytes = float(bytes)
+                kb = bytes / 1024
+            except:
+                print("传入的字节格式不对")
+                return "Error"
+            if kb >= 1024:
+                M = kb / 1024
+                if M >= 1024:
+                    G = M / 1024
+                    return "%.3fG" % (G)
+                else:
+                    return "%.3fM" % (M)
+            else:
+                return "%.3fK" % (kb)
+
+        if url:
+            try:
+                if enable_proxy:
+                    proxies = ProxyPool().get()
+                    # create the object, assign it to a variable
+                    proxy = request.ProxyHandler(proxies)
+                    # construct a new opener using your proxy settings
+                    opener = request.build_opener(proxy)
+                    # install the openen on the module-level
+                    request.install_opener(opener)
+                # 测试可以打开进度条,生产环境禁用进度条
+                filename, headers = request.urlretrieve(url, file_path, progress_callfunc, data)
+                # filename, headers = request.urlretrieve(url, file_path, data)
+                print(filename,headers)
+
+                if callable(call_func):
+                    call_func()
+                return filename
+            except Exception as e:
+                print(e)
+                return ''
+        else:
+            return ''
+
+if __name__ == '__main__':

-# if __name__ == '__main__':
-    # a = AttachmentDownloader().fetch_attachment(
-    #     file_name='成建制移民村(五标段)合同',
-    #     file_type='pdf',
-    #     download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
-    # )
-    # print(a)
+    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
+    attachment = AttachmentDownloader().fetch_attachment(
+        file_name="file_name", file_type="pdf", download_url=url,
+        enable_proxy=False)
+    print(attachment)

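The heart of the rewritten _fetch_attachment is a streamed download: 20 KB chunks are written to disk, accumulated for fid hashing, and reported through a tqdm progress bar. A condensed, self-contained sketch of that loop (parameters trimmed to the essentials; proxy/retry handling omitted):

import requests
import tqdm

def stream_download(url: str, file_path: str) -> bytes:
    stream = b''
    with requests.get(url, stream=True, timeout=60, verify=False) as req:
        total = int(req.headers.get('Content-Length') or 0)
        if req.status_code == 200:
            with open(file_path, 'wb') as f, \
                 tqdm.tqdm(total=total, unit='B', unit_scale=True,
                           unit_divisor=1024, ascii=True, desc=file_path) as bar:
                for chunk in req.iter_content(chunk_size=1024 * 20):
                    if chunk:
                        f.write(chunk)       # persist the chunk
                        stream += chunk      # keep bytes in memory for create_fid()
                        bar.update(len(chunk))
    return stream
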
+ 136 - 0
FworkSpider/untils/cleaner.py

@@ -0,0 +1,136 @@
+import re
+__all__ = ['cleaner']
+
+# 独立元素
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
+    '\\xa0|\\u3000': '',  # 空格
+    '<!--[\s\S]*?-->': '',  # 注释
+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # 输入框
+    '<img[^>]*>': '<br>',  # 图片
+}
+# 行内元素
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # 超链接
+    '<link>|<link [^>]*>|</link>': '',  # 超链接
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+}
+# 块级元素
+BLOCK_TAGS = {
+    '<div>\s*?</div>':'',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
+    '<p>|<p [^>]*>': '<br>',  # 段落
+    '</p>': '',  # 段落
+    '<div>|<div [^>]*>': '<br>',  # 分割 division
+    '</div>': '',  # 分割 division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
+}
+# 其他
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+
+}
+# 样式
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+
+}
+# 空白符
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css标签集合
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css属性集合
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+    OTHER.update(special)
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html

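Quick usage sketch for the new cleaner module (import path assumes the FworkSpider project root is on sys.path):

from untils.cleaner import cleaner

raw = '<html><body><div style="color:red"><p>公告内容</p><script>alert(1)</script></div></body></html>'
print(cleaner(raw))                   # scripts/styles/attributes stripped, <br> kept for line breaks
print(cleaner(raw, completely=True))  # additionally drops canvas/iframe and stray non-CJK tags
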
+ 3 - 2
FworkSpider/untils/cookie_pool.py

@@ -131,6 +131,7 @@ class PageCookiePool(CookiePoolInterface):
                         )
                     )
                     try:
+                        print('????')
                         cookies = self.create_cookie()
                         if cookies:
                             self.add_cookies(cookies)
@@ -178,7 +179,7 @@ class PageCookiePool(CookiePoolInterface):
             try:
                 cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
                 if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
+                    log.info("暂无cookie 生产中...")
                     self._keep_alive = False
                     self._min_cookies = 1
                     with RedisLock(
@@ -291,7 +292,7 @@ class LoginCookiePool(CookiePoolInterface):
             try:
                 user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                 if not user_cookie and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
+                    log.info("暂无cookie 生产中...")
                     self.login()
                     continue


+ 1 - 1
FworkSpider/untils/create_menus.py

@@ -19,7 +19,7 @@ class Details:
         return self._to_db_xs
     def main(self,page):
         menus_list = []
-        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "中国南方航空采购招标网", "$options": "$i"}}})
+        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
         # print(data)
         for item in data:
             # print(item)

+ 7 - 0
FworkSpider/untils/get_imgcode.py

@@ -12,3 +12,10 @@ def get_code(file_path: str) -> dict:
     response = requests.post(upload_address, headers=headers, files=content, stream=True)
     return response.json()

+def get_code_det(image_bytes) -> dict:
+   upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
+   content = {'image_content': image_bytes}
+   headers = {'accept': 'application/json'}
+   response = requests.post(upload_address, headers=headers, files=content, stream=True)
+   return response.json()
+

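Usage sketch for the new get_code_det helper, which posts raw image bytes to the verify_det endpoint (captcha.png is a placeholder file name):

from untils.get_imgcode import get_code_det

with open("captcha.png", "rb") as f:
    result = get_code_det(f.read())
print(result)  # JSON response from the detection service
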
+ 55 - 89
FworkSpider/untils/tools.py

@@ -7,82 +7,15 @@ from setting import WECHAT_WARNING_URL,WECHAT_WARNING_PHONE,WARNING_INTERVAL,WEC
 import bson
 from feapder.utils.log import log
 from feapder.db.mongodb import MongoDB
-
+from .cleaner import cleaner
+import sys

 SearchText = namedtuple('SearchText', ['total'])


-def substitute(html_str):
+def substitute(html_str,special=None, completely=False):
     """HTML 替换"""
-    patterns = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<span .*?>': '',
-        '</span> ': '',
-        '</span>': '',
-        '<span>': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '</div>': '<br>',
-        '<img .*?>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "align='.*?'": '',
-        'align=".*?"': '',
-        'border=".*?"': '',
-        "border='.*?'": '',
-        'cellpadding=".*?"': '',
-        "cellpadding='.*?'": '',
-        'cellspacing=".*?"': '',
-        "cellspacing='.*?'": '',
-        'center=".*?"': '',
-        "center='.*?'": '',
-        'width=".*?"': '',
-        "width='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body.*?>': '',
-        '</body>': '',
-        '<script.*?>': '',
-        '</script>': '',
-        '【关闭】': '',
-        '【打印】': '',
-        'function .*?() ': '',
-        'var .*?;': '',
-        'if .*?\)': '',
-        '{[^{}]+}': '',
-        '{.*?}': '',
-    }
-
-    def substitutes(k, v, c):
-        return re.sub(k, v, c)
-
-    for k, v in patterns.items():
-        html_str = re.sub(k, v, substitutes(k, v, html_str), re.S, re.M)
+    html_str = cleaner(html=html_str,special=None, completely=False)
     return html_str


@@ -188,6 +121,15 @@ class CustomCheckError(JyBasicException):
         self.err_details = kwargs
         for key, val in kwargs.items():
             setattr(self, key, val)
+class HtmlEmptyError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = '正文获取异常,正文为空', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
 class CheckPrePareRequest:

     def __init__(self):
@@ -200,33 +142,57 @@ class CheckPrePareRequest:
             '终止', '系统'
         }

-    @staticmethod
-    def check_es_cache(title: str, publish_time: int, rows: dict):
-        """
-
-        :param title:  标题
-        :param publish_time: 发布时间的时间戳(l_np_publishtime)
-        :param rows: 采集内容
-        """
-        pass
-        # retrieved_result = es_query(title, publish_time)
-        # if retrieved_result != 0:
-        #     '''es查询数据结果'''
-        #     rows['count'] = retrieved_result
-        #     raise CustomCheckError(code=10105, reason='标题内容已存在es')
-
     def check_crawl_title(self, title: str):
     def check_crawl_title(self, title: str):
         for keyword in self.crawl_keywords:
             valid_keyword = re.search(keyword, title)
             if valid_keyword is not None:
                 break
         else:
+            # raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+            return 10106,'标题未检索到采集关键词'
+        return 200,'ok'
+
 
 
     def __check(self, rows: dict):
     def __check(self, rows: dict):
         title, publish_time = rows['title'], rows['l_np_publishtime']
         title, publish_time = rows['title'], rows['l_np_publishtime']
         self.check_crawl_title(title)
         self.check_crawl_title(title)
-        self.check_es_cache(title, publish_time, rows)
 
 
     def __call__(self, rows: dict, *args, **kwargs):
     def __call__(self, rows: dict, *args, **kwargs):
-        self.__check(rows)
+        self.__check(rows)
+
+def get_proxy():
+    headers = {
+        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+    }
+    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+    print(f"切换代理:{proxy.get('data')}")
+    return proxy.get("data").get("http")
+import json
+
+class Obj(object):
+    def __init__(self, dict_):
+        self.__dict__.update(dict_)
+
+def get_argvs():
+    argvs = {"next_page":False,"max_page":10}
+    for item in sys.argv[1:]:
+        print(item)
+        if item.startswith("--"):
+            argvs[item.replace("--", "").split('=')[0]] = int(item.split('=')[-1])
+    return json.loads(json.dumps(argvs), object_hook=Obj)
+
+def search(pattern, string):
+    result = re.search(pattern, string)
+    if result:
+        return result.groups()[0]
+
+def search_construction(string):
+    result = re.search('pattern', string)
+    if result:
+        return result.groups()[0]
+
+def search_floor(string):
+    result = re.search('pattern', string)
+    if result:
+        return result.groups()[0]
+

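The new get_argvs helper turns --key=value command-line flags into attribute access, with next_page/max_page defaults baked in. A small usage sketch:

import sys
from untils.tools import get_argvs

sys.argv += ["--max_page=3"]   # what a crawl scheduler might pass on the command line
args = get_argvs()
print(args.max_page)   # 3 (parsed as int)
print(args.next_page)  # False (default)
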
+ 0 - 23
NoteWork/cesspider/__init__.py

@@ -1,23 +0,0 @@
-__all__ = [
-    "ces",
-    "css",
-    "cssw",
-    "demo",
-    "down_ces",
-    "example",
-    "hubeijianzhu",
-    "jiangxistouces",
-    "magpces",
-    "qyzcdzzbcgjypt",
-    "中国南方电网电子采购交易平台",
-    "交通银行供应商门户",
-    "华创迅采电子采购平台",
-    "国家税务总局宁波市税务局",
-    "城轨采购网",
-    "山西省招标投标协会",
-    "测试查询",
-    "甘肃政府采购网",
-    "甘肃政府采购网_ces",
-    "甘肃政府采购网_new",
-    "福建省政府采购网"
-]

+ 0 - 247
NoteWork/cesspider/cesspider

@@ -1,247 +0,0 @@
-Thread-5|2022-01-24 09:41:46,749|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Zglbsww|2022-01-24 09:41:46,754|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Thread-4|2022-01-24 09:41:46,758|collector.py|__input_data|line:108|INFO| 重置丢失任务完毕,共8条
-Zglbsww|2022-01-24 09:41:46,766|scheduler.py|__add_task|line:215|INFO| 检查到有待做任务 8 条,不重下发新任务,将接着上回异常终止处继续抓取
-Thread-5|2022-01-24 09:41:47,763|request.py|get_response|line:305|DEBUG| 
-                -------------- Zglbsww.parse request for ----------------
-                url  = https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                method = POST
-                body = {'proxies': False, 'data': '{"timeType": "month", "areaCode": "-1", "mainType": "-1", "purchaser": null, "information": null, "sTime": "", "eTime": "", "classify": "-1", "region": "-1", "level": "", "selectedState": "", "purchaseType": "-1", "noticeType": 1, "orders": "publish_time", "dirs": "desc", "current": 1, "size": 10, "page": {}}', 'headers': {'Content-Type': 'application/json', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36'}, 'timeout': 22, 'stream': True, 'verify': False}
-                
-Thread-3|2022-01-24 09:42:19,071|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁七局电务公司武汉北III场扩能改造工程电力电缆询价文件",
-                                                "publishtime": "2022-01-24 09:40:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485414971692441602&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:41:48",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485414971692441602&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:42:19,122|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:42:50,355|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁七局电务公司武汉北III场扩能改造工程SMC电缆沟支架询价文件",
-                                                "publishtime": "2022-01-24 09:39:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485423048057802753&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:42:18",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485423048057802753&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:42:50,411|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:43:21,545|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "轨道交通B1线项目砂浆采购询价书",
-                                                "publishtime": "2022-01-24 09:39:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485182479012319233&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:42:48",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485182479012319233&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:43:21,575|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条
-Thread-3|2022-01-24 09:43:52,756|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [
-                {
-                                "parse": "self.detail_get",
-                                "item": {
-                                                "title": "中铁九局集团第四工程有限公司沈阳地铁6号线一期工程土建施工第三合同段项目经理部玻璃纤维筋询价采购",
-                                                "publishtime": "2022-01-24 09:34:00",
-                                                "spidercode": "a_ztlbsww_jzxtp",
-                                                "site": "中铁鲁班商务网",
-                                                "channel": "采购公告-竞争性谈判",
-                                                "area": "全国",
-                                                "city": "",
-                                                "competehref": null,
-                                                "href": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485421040418316289&tenantId=1",
-                                                "publishdept": "",
-                                                "iscompete": true,
-                                                "type": "",
-                                                "T": "bidding",
-                                                "l_np_publishtime": "",
-                                                "comeintime": "",
-                                                "sendflag": "false",
-                                                "_d": "comeintime",
-                                                "contenthtml": "",
-                                                "detail": "",
-                                                "projectinfo": null
-                                },
-                                "parser_name": "details_ztlbw",
-                                "date": "2022-01-24 09:43:18",
-                                "deal_detail": [
-                                                "//*"
-                                ],
-                                "create_time": null,
-                                "parse_url": "https://eproport.crecgec.com/#/notice/noticexj-detail?projectId=1485421040418316289&tenantId=1",
-                                "request_params": {},
-                                "failed": 0,
-                                "author": null,
-                                "ex_js": "",
-                                "ex_python": null,
-                                "pri": 1,
-                                "proxies": false,
-                                "files": {
-                                                "list_xpath": "//div[@class=\"****\"]/a",
-                                                "url_xpath": "./@href",
-                                                "name_xpath": "./text()",
-                                                "files_type": [
-                                                                "zip",
-                                                                "doxc",
-                                                                "ftp"
-                                                ],
-                                                "url_key": "http"
-                                },
-                                "error": null,
-                                "render_time": 3
-                }
-]
-                    
-Thread-3|2022-01-24 09:43:52,776|mongo_pipeline.py|save_items|line:49|INFO| 共导出 1 条数据到 mgp_list,  新增 1条, 重复 0 条

+ 0 - 6
NoteWork/cesspider/hubeijianzhu.py

@@ -1,6 +0,0 @@
-import requests
-
-url= 'http://jg.hbcic.net.cn/web/XmManage/XmxxSearch.aspx'
-res = requests.get(url)
-print(res.text)
-print(res.status_code)

+ 0 - 50
NoteWork/cesspider/jiangxistouces.py

@@ -1,50 +0,0 @@
-import requests
-
-headers = {
-    "Connection": "keep-alive",
-    "Cache-Control": "max-age=0",
-    "Upgrade-Insecure-Requests": "1",
-    "Origin": "http://www.ccgp-jilin.gov.cn",
-    "Content-Type": "application/x-www-form-urlencoded",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Referer": "http://www.ccgp-jilin.gov.cn/ext/search/morePolicyNews.action",
-    "Accept-Language": "zh-CN,zh;q=0.9"
-}
-cookies = {
-    "_gscu_1208125908": "40588857p73qs140",
-    "_gscbrs_1208125908": "1"
-}
-url = "http://www.ccgp-jilin.gov.cn/ext/search/morePolicyNews.action"
-data = {
-    "currentPage": "3",
-    "noticetypeId": "1",
-    "categoryId": "124",
-    "articleId": "1",
-    "ss": "41544c156ff82a74717a3c47c49a00d1017ab072dbbbba7bd0a0dbd087ba7b776b866418113ee01042af289114c1de749c6c79942d413015bdbfb6f2b59eba1280a7b2a6589c2a4db0d8ad7b2b5acfdd6e97a3fea1cf7cdf4bfa207d0990edf214eee9324d40425029e9fd958c810c6f86866257c925b4149bf76b6a8d338857",
-    "id": "1",
-    "pager.pageNumber": "1",
-    "las": "38"
-}
-response = requests.post(url, data=data, verify=False)
-
-print(response.text)
-print(response)
-if '前台同步提交表单使用验证方法之后无效' in response.text:
-    print('加密失败')
-import execjs
-
-# js_str = '''
-# function subdd(){
-# var _0x2adf75 = '10110';
-# var _0x5d50cf = '_0x40dee9['map']['modulus']';
-# var _0x209e59 = 'UsVjaS7Wj4';
-# var _0x3e26ea = '009f40b74f0e3bfcf449431eceedaa7984d852a754038eed36091fb0d6c1390b647f56dd82d8953c6e97678e20c7d3976ee3639dc386a2676578596bea3766d9f8f2402a8300b3cfb987dfeee63159ab1cdfe41f04fc2446f17ad8ee2878df59cba50ea4af18f6238172b55129dd7357adb90af15a3ed02bcc0bcc68d4f6c6696f';
-# _0x43ae35 = encryptedString(_0x3e26ea, encodeURIComponent(_0x209e59));
-# return _0x43ae35;}
-# '''
-# node_model_path = "D:/剑鱼爬虫/py_spiders/node_modules"
-# # with open('..cesspider/js/ces.js', 'rb') as f:
-# #     js_str = f.read()
-# ctx = execjs.compile(js_str)
-# res = ctx.call('subdd')
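
The scratch request above builds headers and cookies dicts but never passes them to requests.post, which may be one reason the server answers with the form-validation failure text. A minimal sketch of wiring them in (reusing the url, data, headers and cookies names defined above; the "ss" value would still have to come from the site's RSA routine):

    import requests

    # verify=False is kept only because the original scratch script used it.
    response = requests.post(url, headers=headers, cookies=cookies, data=data, verify=False)
    print(response.status_code, len(response.text))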

+ 0 - 80
NoteWork/cesspider/js/rsa/Barrett.js

@@ -1,80 +0,0 @@
-// BarrettMu, a class for performing Barrett modular reduction computations in
-// JavaScript.
-//
-// Requires BigInt.js.
-//
-// Copyright 2004-2005 David Shapiro.
-//
-// You may use, re-use, abuse, copy, and modify this code to your liking, but
-// please keep this header.
-//
-// Thanks!
-// 
-// Dave Shapiro
-// dave@ohdave.com 
-const BigInt = require("./BigInt");
-var biCopy = BigInt.biCopy
-var biHighIndex = BigInt.biHighIndex
-var bigInt = BigInt.bigInt
-var biDivide = BigInt.biDivide
-
-
-function BarrettMu(m)
-{
-	this.modulus = biCopy(m);
-	this.k = biHighIndex(this.modulus) + 1;
-	var b2k = new bigInt();
-	b2k.digits[2 * this.k] = 1; // b2k = b^(2k)
-	this.mu = biDivide(b2k, this.modulus);
-	this.bkplus1 = new bigInt();
-	this.bkplus1.digits[this.k + 1] = 1; // bkplus1 = b^(k+1)
-	this.modulo = BarrettMu_modulo;
-	this.multiplyMod = BarrettMu_multiplyMod;
-	this.powMod = BarrettMu_powMod;
-}
-
-function BarrettMu_modulo(x)
-{
-	var q1 = biDivideByRadixPower(x, this.k - 1);
-	var q2 = biMultiply(q1, this.mu);
-	var q3 = biDivideByRadixPower(q2, this.k + 1);
-	var r1 = biModuloByRadixPower(x, this.k + 1);
-	var r2term = biMultiply(q3, this.modulus);
-	var r2 = biModuloByRadixPower(r2term, this.k + 1);
-	var r = biSubtract(r1, r2);
-	if (r.isNeg) {
-		r = biAdd(r, this.bkplus1);
-	}
-	var rgtem = biCompare(r, this.modulus) >= 0;
-	while (rgtem) {
-		r = biSubtract(r, this.modulus);
-		rgtem = biCompare(r, this.modulus) >= 0;
-	}
-	return r;
-}
-
-function BarrettMu_multiplyMod(x, y)
-{
-	/*
-	x = this.modulo(x);
-	y = this.modulo(y);
-	*/
-	var xy = biMultiply(x, y);
-	return this.modulo(xy);
-}
-
-function BarrettMu_powMod(x, y)
-{
-	var result = new bigInt();
-	result.digits[0] = 1;
-	var a = x;
-	var k = y;
-	while (true) {
-		if ((k.digits[0] & 1) != 0) result = this.multiplyMod(result, a);
-		k = biShiftRight(k, 1);
-		if (k.digits[0] == 0 && biHighIndex(k) == 0) break;
-		a = this.multiplyMod(a, a);
-	}
-	return result;
-}
-module.exports.BarrettMu = BarrettMu
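
BarrettMu above precomputes mu = floor(b^(2k) / m) for radix b = 2^16 and then reduces x mod m with shifts, multiplies and at most a couple of subtractions. (Note that Barrett.js here only imports biCopy, biHighIndex, bigInt and biDivide from BigInt.js, while its modulo/powMod also reference biMultiply, biSubtract, biAdd, biCompare, biShiftRight and the radix-power helpers, none of which BigInt.js exports.) A minimal Python sketch of the same reduction over native integers, with illustrative helper names that are not part of the deleted code:

    B = 1 << 16  # radix used by BigInt.js

    def num_digits(n):
        """Number of base-2**16 digits of n, i.e. biHighIndex(n) + 1."""
        d = 0
        while n:
            d += 1
            n >>= 16
        return max(d, 1)

    def barrett_mod(x, m, k, mu):
        """Barrett reduction: x mod m, valid for 0 <= x < m * m."""
        q1 = x // B ** (k - 1)
        q3 = (q1 * mu) // B ** (k + 1)
        r = x % B ** (k + 1) - (q3 * m) % B ** (k + 1)
        if r < 0:
            r += B ** (k + 1)
        while r >= m:
            r -= m
        return r

    m = 0xC96BF01E                                  # illustrative modulus
    k, mu = num_digits(m), B ** (2 * num_digits(m)) // m
    assert barrett_mod(123456789 * 987654321, m, k, mu) == (123456789 * 987654321) % m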

+ 0 - 614
NoteWork/cesspider/js/rsa/BigInt.js

@@ -1,614 +0,0 @@
-// BigInt, a suite of routines for performing multiple-precision arithmetic in
-// JavaScript.
-//
-// Copyright 1998-2005 David Shapiro.
-//
-// You may use, re-use, abuse,
-// copy, and modify this code to your liking, but please keep this header.
-// Thanks!
-//
-// Dave Shapiro
-// dave@ohdave.com
-
-// IMPORTANT THING: Be sure to set maxDigits according to your precision
-// needs. Use the setMaxDigits() function to do this. See comments below.
-//
-// Tweaked by Ian Bunning
-// Alterations:
-// Fix bug in function biFromHex(s) to allow
-// parsing of strings of length != 0 (mod 4)
-
-// Changes made by Dave Shapiro as of 12/30/2004:
-//
-// The BigInt() constructor doesn't take a string anymore. If you want to
-// create a BigInt from a string, use biFromDecimal() for base-10
-// representations, biFromHex() for base-16 representations, or
-// biFromString() for base-2-to-36 representations.
-//
-// biFromArray() has been removed. Use biCopy() instead, passing a BigInt
-// instead of an array.
-//
-// The BigInt() constructor now only constructs a zeroed-out array.
-// Alternatively, if you pass <true>, it won't construct any array. See the
-// biCopy() method for an example of this.
-//
-// Be sure to set maxDigits depending on your precision needs. The default
-// zeroed-out array ZERO_ARRAY is constructed inside the setMaxDigits()
-// function. So use this function to set the variable. DON'T JUST SET THE
-// VALUE. USE THE FUNCTION.
-//
-// ZERO_ARRAY exists to hopefully speed up construction of BigInts(). By
-// precalculating the zero array, we can just use slice(0) to make copies of
-// it. Presumably this calls faster native code, as opposed to setting the
-// elements one at a time. I have not done any timing tests to verify this
-// claim.
-
-// Max number = 10^16 - 2 = 9999999999999998;
-//               2^53     = 9007199254740992;
-
-var biRadixBase = 2;
-var biRadixBits = 16;
-var bitsPerDigit = biRadixBits;
-var biRadix = 1 << 16;
-// = 2^16 = 65536
-var biHalfRadix = biRadix >>> 1;
-var biRadixSquared = biRadix * biRadix;
-var maxDigitVal = biRadix - 1;
-var maxInteger = 9999999999999998;
-
-// maxDigits:
-// Change this to accommodate your largest number size. Use setMaxDigits()
-// to change it!
-//
-// In general, if you're working with numbers of size N bits, you'll need 2*N
-// bits of storage. Each digit holds 16 bits. So, a 1024-bit key will need
-//
-// 1024 * 2 / 16 = 128 digits of storage.
-//
-
-var maxDigits;
-var ZERO_ARRAY;
-var bigZero, bigOne;
-
-function setMaxDigits(value) {
-    maxDigits = value;
-    ZERO_ARRAY = new Array(maxDigits);
-    for (var iza = 0; iza < ZERO_ARRAY.length; iza++)
-        ZERO_ARRAY[iza] = 0;
-    bigZero = new BigInt();
-    bigOne = new BigInt();
-    bigOne.digits[0] = 1;
-}
-
-setMaxDigits(20);
-
-// The maximum number of digits in base 10 you can convert to an
-// integer without JavaScript throwing up on you.
-var dpl10 = 15;
-// lr10 = 10 ^ dpl10
-var lr10 = biFromNumber(1000000000000000);
-
-function BigInt(flag) {
-    if (typeof flag == "boolean" && flag == true) {
-        this.digits = null;
-    } else {
-        this.digits = ZERO_ARRAY.slice(0);
-    }
-    this.isNeg = false;
-}
-
-function biFromDecimal(s) {
-    var isNeg = s.charAt(0) == '-';
-    var i = isNeg ? 1 : 0;
-    var result;
-    // Skip leading zeros.
-    while (i < s.length && s.charAt(i) == '0')
-        ++i;
-    if (i == s.length) {
-        result = new BigInt();
-    } else {
-        var digitCount = s.length - i;
-        var fgl = digitCount % dpl10;
-        if (fgl == 0)
-            fgl = dpl10;
-        result = biFromNumber(Number(s.substr(i, fgl)));
-        i += fgl;
-        while (i < s.length) {
-            result = biAdd(biMultiply(result, lr10), biFromNumber(Number(s.substr(i, dpl10))));
-            i += dpl10;
-        }
-        result.isNeg = isNeg;
-    }
-    return result;
-}
-
-function biCopy(bi) {
-    var result = new BigInt(true);
-    result.digits = bi.digits.slice(0);
-    result.isNeg = bi.isNeg;
-    return result;
-}
-
-function biFromNumber(i) {
-    var result = new BigInt();
-    result.isNeg = i < 0;
-    i = Math.abs(i);
-    var j = 0;
-    while (i > 0) {
-        result.digits[j++] = i & maxDigitVal;
-        i >>= biRadixBits;
-    }
-    return result;
-}
-
-function reverseStr(s) {
-    var result = "";
-    for (var i = s.length - 1; i > -1; --i) {
-        result += s.charAt(i);
-    }
-    return result;
-}
-
-var hexatrigesimalToChar = new Array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z');
-
-function biToString(x, radix) // 2 <= radix <= 36
-{
-    var b = new BigInt();
-    b.digits[0] = radix;
-    var qr = biDivideModulo(x, b);
-    var result = hexatrigesimalToChar[qr[1].digits[0]];
-    while (biCompare(qr[0], bigZero) == 1) {
-        qr = biDivideModulo(qr[0], b);
-        digit = qr[1].digits[0];
-        result += hexatrigesimalToChar[qr[1].digits[0]];
-    }
-    return (x.isNeg ? "-" : "") + reverseStr(result);
-}
-
-function biToDecimal(x) {
-    var b = new BigInt();
-    b.digits[0] = 10;
-    var qr = biDivideModulo(x, b);
-    var result = String(qr[1].digits[0]);
-    while (biCompare(qr[0], bigZero) == 1) {
-        qr = biDivideModulo(qr[0], b);
-        result += String(qr[1].digits[0]);
-    }
-    return (x.isNeg ? "-" : "") + reverseStr(result);
-}
-
-var hexToChar = new Array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
-
-function digitToHex(n) {
-    var mask = 0xf;
-    var result = "";
-    for (i = 0; i < 4; ++i) {
-        result += hexToChar[n & mask];
-        n >>>= 4;
-    }
-    return reverseStr(result);
-}
-
-function biToHex(x) {
-    var result = "";
-    var n = biHighIndex(x);
-    for (var i = biHighIndex(x); i > -1; --i) {
-        result += digitToHex(x.digits[i]);
-    }
-    return result;
-}
-
-function charToHex(c) {
-    var ZERO = 48;
-    var NINE = ZERO + 9;
-    var littleA = 97;
-    var littleZ = littleA + 25;
-    var bigA = 65;
-    var bigZ = 65 + 25;
-    var result;
-
-    if (c >= ZERO && c <= NINE) {
-        result = c - ZERO;
-    } else if (c >= bigA && c <= bigZ) {
-        result = 10 + c - bigA;
-    } else if (c >= littleA && c <= littleZ) {
-        result = 10 + c - littleA;
-    } else {
-        result = 0;
-    }
-    return result;
-}
-
-function hexToDigit(s) {
-    var result = 0;
-    var sl = Math.min(s.length, 4);
-    for (var i = 0; i < sl; ++i) {
-        result <<= 4;
-        result |= charToHex(s.charCodeAt(i))
-    }
-    return result;
-}
-
-function biFromHex(s) {
-    var result = new BigInt();
-    var sl = s.length;
-    for (var i = sl, j = 0; i > 0; i -= 4,
-    ++j) {
-        result.digits[j] = hexToDigit(s.substr(Math.max(i - 4, 0), Math.min(i, 4)));
-    }
-    return result;
-}
-
-function biFromString(s, radix) {
-    var isNeg = s.charAt(0) == '-';
-    var istop = isNeg ? 1 : 0;
-    var result = new BigInt();
-    var place = new BigInt();
-    place.digits[0] = 1;
-    // radix^0
-    for (var i = s.length - 1; i >= istop; i--) {
-        var c = s.charCodeAt(i);
-        var digit = charToHex(c);
-        var biDigit = biMultiplyDigit(place, digit);
-        result = biAdd(result, biDigit);
-        place = biMultiplyDigit(place, radix);
-    }
-    result.isNeg = isNeg;
-    return result;
-}
-
-function biToBytes(x) // Returns a string containing raw bytes.
-{
-    var result = "";
-    for (var i = biHighIndex(x); i > -1; --i) {
-        result += digitToBytes(x.digits[i]);
-    }
-    return result;
-}
-
-function digitToBytes(n) // Convert two-byte digit to string containing both bytes.
-{
-    var c1 = String.fromCharCode(n & 0xff);
-    n >>>= 8;
-    var c2 = String.fromCharCode(n & 0xff);
-    return c2 + c1;
-}
-
-function biDump(b) {
-    return (b.isNeg ? "-" : "") + b.digits.join(" ");
-}
-
-function biAdd(x, y) {
-    var result;
-
-    if (x.isNeg != y.isNeg) {
-        y.isNeg = !y.isNeg;
-        result = biSubtract(x, y);
-        y.isNeg = !y.isNeg;
-    } else {
-        result = new BigInt();
-        var c = 0;
-        var n;
-        for (var i = 0; i < x.digits.length; ++i) {
-            n = x.digits[i] + y.digits[i] + c;
-            result.digits[i] = n & 0xffff;
-            c = Number(n >= biRadix);
-        }
-        result.isNeg = x.isNeg;
-    }
-    return result;
-}
-
-function biSubtract(x, y) {
-    var result;
-    if (x.isNeg != y.isNeg) {
-        y.isNeg = !y.isNeg;
-        result = biAdd(x, y);
-        y.isNeg = !y.isNeg;
-    } else {
-        result = new BigInt();
-        var n, c;
-        c = 0;
-        for (var i = 0; i < x.digits.length; ++i) {
-            n = x.digits[i] - y.digits[i] + c;
-            result.digits[i] = n & 0xffff;
-            // Stupid non-conforming modulus operation.
-            if (result.digits[i] < 0)
-                result.digits[i] += biRadix;
-            c = 0 - Number(n < 0);
-        }
-        // Fix up the negative sign, if any.
-        if (c == -1) {
-            c = 0;
-            for (var i = 0; i < x.digits.length; ++i) {
-                n = 0 - result.digits[i] + c;
-                result.digits[i] = n & 0xffff;
-                // Stupid non-conforming modulus operation.
-                if (result.digits[i] < 0)
-                    result.digits[i] += biRadix;
-                c = 0 - Number(n < 0);
-            }
-            // Result is opposite sign of arguments.
-            result.isNeg = !x.isNeg;
-        } else {
-            // Result is same sign.
-            result.isNeg = x.isNeg;
-        }
-    }
-    return result;
-}
-
-function biHighIndex(x) {
-    var result = x.digits.length - 1;
-    while (result > 0 && x.digits[result] == 0)
-        --result;
-    return result;
-}
-
-function biNumBits(x) {
-    var n = biHighIndex(x);
-    var d = x.digits[n];
-    var m = (n + 1) * bitsPerDigit;
-    var result;
-    for (result = m; result > m - bitsPerDigit; --result) {
-        if ((d & 0x8000) != 0)
-            break;
-        d <<= 1;
-    }
-    return result;
-}
-
-function biMultiply(x, y) {
-    var result = new BigInt();
-    var c;
-    var n = biHighIndex(x);
-    var t = biHighIndex(y);
-    var u, uv, k;
-
-    for (var i = 0; i <= t; ++i) {
-        c = 0;
-        k = i;
-        for (j = 0; j <= n; ++j,
-        ++k) {
-            uv = result.digits[k] + x.digits[j] * y.digits[i] + c;
-            result.digits[k] = uv & maxDigitVal;
-            c = uv >>> biRadixBits;
-        }
-        result.digits[i + n + 1] = c;
-    }
-    // Someone give me a logical xor, please.
-    result.isNeg = x.isNeg != y.isNeg;
-    return result;
-}
-
-function biMultiplyDigit(x, y) {
-    var n, c, uv;
-
-    result = new BigInt();
-    n = biHighIndex(x);
-    c = 0;
-    for (var j = 0; j <= n; ++j) {
-        uv = result.digits[j] + x.digits[j] * y + c;
-        result.digits[j] = uv & maxDigitVal;
-        c = uv >>> biRadixBits;
-    }
-    result.digits[1 + n] = c;
-    return result;
-}
-
-function arrayCopy(src, srcStart, dest, destStart, n) {
-    var m = Math.min(srcStart + n, src.length);
-    for (var i = srcStart, j = destStart; i < m; ++i,
-    ++j) {
-        dest[j] = src[i];
-    }
-}
-
-var highBitMasks = new Array(0x0000,0x8000,0xC000,0xE000,0xF000,0xF800,0xFC00,0xFE00,0xFF00,0xFF80,0xFFC0,0xFFE0,0xFFF0,0xFFF8,0xFFFC,0xFFFE,0xFFFF);
-
-function biShiftLeft(x, n) {
-    var digitCount = Math.floor(n / bitsPerDigit);
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, digitCount, result.digits.length - digitCount);
-    var bits = n % bitsPerDigit;
-    var rightBits = bitsPerDigit - bits;
-    for (var i = result.digits.length - 1, i1 = i - 1; i > 0; --i,
-    --i1) {
-        result.digits[i] = ((result.digits[i] << bits) & maxDigitVal) | ((result.digits[i1] & highBitMasks[bits]) >>> (rightBits));
-    }
-    result.digits[0] = ((result.digits[i] << bits) & maxDigitVal);
-    result.isNeg = x.isNeg;
-    return result;
-}
-
-var lowBitMasks = new Array(0x0000,0x0001,0x0003,0x0007,0x000F,0x001F,0x003F,0x007F,0x00FF,0x01FF,0x03FF,0x07FF,0x0FFF,0x1FFF,0x3FFF,0x7FFF,0xFFFF);
-
-function biShiftRight(x, n) {
-    var digitCount = Math.floor(n / bitsPerDigit);
-    var result = new BigInt();
-    arrayCopy(x.digits, digitCount, result.digits, 0, x.digits.length - digitCount);
-    var bits = n % bitsPerDigit;
-    var leftBits = bitsPerDigit - bits;
-    for (var i = 0, i1 = i + 1; i < result.digits.length - 1; ++i,
-    ++i1) {
-        result.digits[i] = (result.digits[i] >>> bits) | ((result.digits[i1] & lowBitMasks[bits]) << leftBits);
-    }
-    result.digits[result.digits.length - 1] >>>= bits;
-    result.isNeg = x.isNeg;
-    return result;
-}
-
-function biMultiplyByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, n, result.digits.length - n);
-    return result;
-}
-
-function biDivideByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, n, result.digits, 0, result.digits.length - n);
-    return result;
-}
-
-function biModuloByRadixPower(x, n) {
-    var result = new BigInt();
-    arrayCopy(x.digits, 0, result.digits, 0, n);
-    return result;
-}
-
-function biCompare(x, y) {
-    if (x.isNeg != y.isNeg) {
-        return 1 - 2 * Number(x.isNeg);
-    }
-    for (var i = x.digits.length - 1; i >= 0; --i) {
-        if (x.digits[i] != y.digits[i]) {
-            if (x.isNeg) {
-                return 1 - 2 * Number(x.digits[i] > y.digits[i]);
-            } else {
-                return 1 - 2 * Number(x.digits[i] < y.digits[i]);
-            }
-        }
-    }
-    return 0;
-}
-
-function biDivideModulo(x, y) {
-    var nb = biNumBits(x);
-    var tb = biNumBits(y);
-    var origYIsNeg = y.isNeg;
-    var q, r;
-    if (nb < tb) {
-        // |x| < |y|
-        if (x.isNeg) {
-            q = biCopy(bigOne);
-            q.isNeg = !y.isNeg;
-            x.isNeg = false;
-            y.isNeg = false;
-            r = biSubtract(y, x);
-            // Restore signs, 'cause they're references.
-            x.isNeg = true;
-            y.isNeg = origYIsNeg;
-        } else {
-            q = new BigInt();
-            r = biCopy(x);
-        }
-        return new Array(q,r);
-    }
-
-    q = new BigInt();
-    r = x;
-
-    // Normalize Y.
-    var t = Math.ceil(tb / bitsPerDigit) - 1;
-    var lambda = 0;
-    while (y.digits[t] < biHalfRadix) {
-        y = biShiftLeft(y, 1);
-        ++lambda;
-        ++tb;
-        t = Math.ceil(tb / bitsPerDigit) - 1;
-    }
-    // Shift r over to keep the quotient constant. We'll shift the
-    // remainder back at the end.
-    r = biShiftLeft(r, lambda);
-    nb += lambda;
-    // Update the bit count for x.
-    var n = Math.ceil(nb / bitsPerDigit) - 1;
-
-    var b = biMultiplyByRadixPower(y, n - t);
-    while (biCompare(r, b) != -1) {
-        ++q.digits[n - t];
-        r = biSubtract(r, b);
-    }
-    for (var i = n; i > t; --i) {
-        var ri = (i >= r.digits.length) ? 0 : r.digits[i];
-        var ri1 = (i - 1 >= r.digits.length) ? 0 : r.digits[i - 1];
-        var ri2 = (i - 2 >= r.digits.length) ? 0 : r.digits[i - 2];
-        var yt = (t >= y.digits.length) ? 0 : y.digits[t];
-        var yt1 = (t - 1 >= y.digits.length) ? 0 : y.digits[t - 1];
-        if (ri == yt) {
-            q.digits[i - t - 1] = maxDigitVal;
-        } else {
-            q.digits[i - t - 1] = Math.floor((ri * biRadix + ri1) / yt);
-        }
-
-        var c1 = q.digits[i - t - 1] * ((yt * biRadix) + yt1);
-        var c2 = (ri * biRadixSquared) + ((ri1 * biRadix) + ri2);
-        while (c1 > c2) {
-            --q.digits[i - t - 1];
-            c1 = q.digits[i - t - 1] * ((yt * biRadix) | yt1);
-            c2 = (ri * biRadix * biRadix) + ((ri1 * biRadix) + ri2);
-        }
-
-        b = biMultiplyByRadixPower(y, i - t - 1);
-        r = biSubtract(r, biMultiplyDigit(b, q.digits[i - t - 1]));
-        if (r.isNeg) {
-            r = biAdd(r, b);
-            --q.digits[i - t - 1];
-        }
-    }
-    r = biShiftRight(r, lambda);
-    // Fiddle with the signs and stuff to make sure that 0 <= r < y.
-    q.isNeg = x.isNeg != origYIsNeg;
-    if (x.isNeg) {
-        if (origYIsNeg) {
-            q = biAdd(q, bigOne);
-        } else {
-            q = biSubtract(q, bigOne);
-        }
-        y = biShiftRight(y, lambda);
-        r = biSubtract(y, r);
-    }
-    // Check for the unbelievably stupid degenerate case of r == -0.
-    if (r.digits[0] == 0 && biHighIndex(r) == 0)
-        r.isNeg = false;
-
-    return new Array(q,r);
-}
-
-function biDivide(x, y) {
-    return biDivideModulo(x, y)[0];
-}
-
-function biModulo(x, y) {
-    return biDivideModulo(x, y)[1];
-}
-
-function biMultiplyMod(x, y, m) {
-    return biModulo(biMultiply(x, y), m);
-}
-
-function biPow(x, y) {
-    var result = bigOne;
-    var a = x;
-    while (true) {
-        if ((y & 1) != 0)
-            result = biMultiply(result, a);
-        y >>= 1;
-        if (y == 0)
-            break;
-        a = biMultiply(a, a);
-    }
-    return result;
-}
-
-function biPowMod(x, y, m) {
-    var result = bigOne;
-    var a = x;
-    var k = y;
-    while (true) {
-        if ((k.digits[0] & 1) != 0)
-            result = biMultiplyMod(result, a, m);
-        k = biShiftRight(k, 1);
-        if (k.digits[0] == 0 && biHighIndex(k) == 0)
-            break;
-        a = biMultiplyMod(a, a, m);
-    }
-    return result;
-}
-
-module.exports.biFromHex = biFromHex
-module.exports.bigInt = BigInt
-module.exports.biHighIndex = biHighIndex
-module.exports.biCopy = biCopy
-module.exports.biHighIndex = biHighIndex
-module.exports.biDivide = biDivide
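
BigInt.js represents every value as a little-endian array of 16-bit "digits" (digits[0] is the least significant), which is what the setMaxDigits/ZERO_ARRAY machinery above is sizing. A short Python sketch of how that layout maps onto an ordinary integer (illustrative helpers, non-negative values only):

    def digits_to_int(digits):
        """Little-endian base-2**16 digit list -> Python int."""
        return sum(d << (16 * i) for i, d in enumerate(digits))

    def int_to_digits(n):
        """Non-negative Python int -> little-endian base-2**16 digits (what biFromNumber builds)."""
        out = []
        while n > 0:
            out.append(n & 0xFFFF)
            n >>= 16
        return out or [0]

    # biFromHex("12345678") fills digits as [0x5678, 0x1234]; the round trip holds:
    assert int_to_digits(0x12345678) == [0x5678, 0x1234]
    assert digits_to_int([0x5678, 0x1234]) == 0x12345678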

+ 0 - 583
NoteWork/cesspider/js/rsa/RSA.js

@@ -1,583 +0,0 @@
-/*
-* Copyright (c) 2015 Eric Wilde.
-* Copyright 1998-2015 David Shapiro.
-* 
-* RSA.js is a suite of routines for performing RSA public-key computations
-* in JavaScript.  The cryptographic functions herein are used for encoding
-* and decoding strings to be sent over unsecure channels.
-*
-* To use these routines, a pair of public/private keys is created through a
-* number of means (OpenSSL tools on Linux/Unix, Dave Shapiro's
-* RSAKeyGenerator program on Windows).  These keys are passed to RSAKeyPair
-* as hexadecimal strings to create an encryption key object.  This key object
-* is then used with encryptedString to encrypt blocks of plaintext using the
-* public key.  The resulting cyphertext blocks can be decrypted with
-* decryptedString.
-*
-* Note that the cryptographic functions herein are complementary to those
-* found in CryptoFuncs.php and CryptoFuncs.pm.  Hence, encrypted messages may
-* be sent between programs written in any of those languages.  The most
-* useful, of course is to send messages encrypted by a Web page using RSA.js
-* to a PHP or Perl script running on a Web servitron.
-*
-* Also, the optional padding flag may be specified on the call to
-* encryptedString, in which case blocks of cyphertext that are compatible
-* with real crypto libraries such as OpenSSL or Microsoft will be created.
-* These blocks of cyphertext can then be sent to Web servitron that uses one
-* of these crypto libraries for decryption.  This allows messages encrypted
-* with longer keys to be decrypted quickly on the Web server as well as
-* making for more secure communications when a padding algorithm such as
-* PKCS1v1.5 is used.
-*
-* These routines require BigInt.js and Barrett.js.
-*/
-
-/*****************************************************************************/
-
-/*
-* Modifications
-* -------------
-*
-* 2014 Jan 11  E. Wilde       Add optional padding flag to encryptedString
-*                             for compatibility with real crypto libraries
-*                             such as OpenSSL or Microsoft.  Add PKCS1v1.5
-*                             padding.
-*
-* 2015 Jan 5  D. Shapiro      Add optional encoding flag for encryptedString
-*                             and encapsulate padding and encoding constants
-*                             in RSAAPP object.
-*
-* Original Code
-* -------------
-*
-* Copyright 1998-2005 David Shapiro.
-*
-* You may use, re-use, abuse, copy, and modify this code to your liking, but
-* please keep this header.
-*
-* Thanks!
-* 
-* Dave Shapiro
-* dave@ohdave.com
-*/
-
-/*****************************************************************************/
-const BigInt = require("./BigInt");
-
-const Barrett = require("./Barrett");
-// const bigInt = require("./BigInt");
-// console.log(bigInt,biFromHex)
-var bigInt = BigInt.bigInt
-var biFromHex = BigInt.biFromHex
-var biHighIndex = BigInt.biHighIndex
-var BarrettMu = Barrett.BarrettMu
-var RSAAPP = {};
-
-RSAAPP.NoPadding = "NoPadding";
-RSAAPP.PKCS1Padding = "PKCS1Padding";
-RSAAPP.RawEncoding = "RawEncoding";
-RSAAPP.NumericEncoding = "NumericEncoding"
-
-/*****************************************************************************/
-
-function RSAKeyPair(encryptionExponent, decryptionExponent, modulus, keylen)
-/*
-* encryptionExponent                    The encryption exponent (i.e. public
-*                                       encryption key) to be used for
-*                                       encrypting messages.  If you aren't
-*                                       doing any encrypting, a dummy
-*                                       exponent such as "10001" can be
-*                                       passed.
-*
-* decryptionExponent                    The decryption exponent (i.e. private
-*                                       decryption key) to be used for
-*                                       decrypting messages.  If you aren't
-*                                       doing any decrypting, a dummy
-*                                       exponent such as "10001" can be
-*                                       passed.
-*
-* modulus                               The modulus to be used both for
-*                                       encrypting and decrypting messages.
-*
-* keylen                                The optional length of the key, in
-*                                       bits.  If omitted, RSAKeyPair will
-*                                       attempt to derive a key length (but,
-*                                       see the notes below).
-*
-* returns                               The "new" object creator returns an
-*                                       instance of a key object that can be
-*                                       used to encrypt/decrypt messages.
-*
-* This routine is invoked as the first step in the encryption or decryption
-* process to take the three numbers (expressed as hexadecimal strings) that
-* are used for RSA asymmetric encryption/decryption and turn them into a key
-* object that can be used for encrypting and decrypting.
-*
-* The key object is created thusly:
-*
-*      RSAKey = new RSAKeyPair("ABC12345", 10001, "987654FE");
-*
-* or:
-*
-*      RSAKey = new RSAKeyPair("ABC12345", 10001, "987654FE", 64);
-*
-* Note that RSAKeyPair will try to derive the length of the key that is being
-* used, from the key itself.  The key length is especially useful when one of
-* the padding options is used and/or when the encrypted messages created by
-* the routine encryptedString are exchanged with a real crypto library such
-* as OpenSSL or Microsoft, as it determines how many padding characters are
-* appended.
-*
-* Usually, RSAKeyPair can determine the key length from the modulus of the
-* key but this doesn't always work properly, depending on the actual value of
-* the modulus.  If you are exchanging messages with a real crypto library,
-* such as OpenSSL or Microsoft, that depends on the fact that the blocks
-* being passed to it are properly padded, you'll want the key length to be
-* set properly.  If that's the case, of if you just want to be sure, you
-* should specify the key length that you used to generated the key, in bits
-* when this routine is invoked.
-*/
-{
-/*
-* Convert from hexadecimal and save the encryption/decryption exponents and
-* modulus as big integers in the key object.
-*/
-this.e = biFromHex(encryptionExponent);
-this.d = biFromHex(decryptionExponent);
-this.m = biFromHex(modulus);
-/*
-* Using big integers, we can represent two bytes per element in the big
-* integer array, so we calculate the chunk size as:
-*
-*      chunkSize = 2 * (number of digits in modulus - 1)
-*
-* Since biHighIndex returns the high index, not the number of digits, the
-* number 1 has already been subtracted from its answer.
-*
-* However, having said all this, "User Knows Best".  If our caller passes us
-* a key length (in bits), we'll treat it as gospel truth.
-*/
-if (typeof(keylen) != 'number') { this.chunkSize = 2 * biHighIndex(this.m); }
-else { this.chunkSize = keylen / 8; }
-
-this.radix = 16;
-/*
-* Precalculate the stuff used for Barrett modular reductions.
-*/
-this.barrett = new BarrettMu(this.m);
-}
-
-/*****************************************************************************/
-
-function encryptedString(key, s, pad, encoding)
-/*
-* key                                   The previously-built RSA key whose
-*                                       public key component is to be used to
-*                                       encrypt the plaintext string.
-*
-* s                                     The plaintext string that is to be
-*                                       encrypted, using the RSA assymmetric
-*                                       encryption method.
-*
-* pad                                   The optional padding method to use
-*                                       when extending the plaintext to the
-*                                       full chunk size required by the RSA
-*                                       algorithm.  To maintain compatibility
-*                                       with other crypto libraries, the
-*                                       padding method is described by a
-*                                       string.  The default, if not
-*                                       specified is "OHDave".  Here are the
-*                                       choices:
-*
-*                                         OHDave - this is the original
-*                                           padding method employed by Dave
-*                                           Shapiro and Rob Saunders.  If
-*                                           this method is chosen, the
-*                                           plaintext can be of any length.
-*                                           It will be padded to the correct
-*                                           length with zeros and then broken
-*                                           up into chunks of the correct
-*                                           length before being encrypted.
-*                                           The resultant cyphertext blocks
-*                                           will be separated by blanks.
-*
-*                                           Note that the original code by
-*                                           Dave Shapiro reverses the byte
-*                                           order to little-endian, as the
-*                                           plaintext is encrypted.  If
-*                                           either these JavaScript routines
-*                                           or one of the complementary
-*                                           PHP/Perl routines derived from
-*                                           this code is used for decryption,
-*                                           the byte order will be reversed
-*                                           again upon decryption so as to
-*                                           come out correctly.
-*                                           
-*                                           Also note that this padding
-*                                           method is claimed to be less
-*                                           secure than PKCS1Padding.
-*
-*                                         NoPadding - this method truncates
-*                                           the plaintext to the length of
-*                                           the RSA key, if it is longer.  If
-*                                           its length is shorter, it is
-*                                           padded with zeros.  In either
-*                                           case, the plaintext string is
-*                                           reversed to preserve big-endian
-*                                           order before it is encrypted to
-*                                           maintain compatibility with real
-*                                           crypto libraries such as OpenSSL
-*                                           or Microsoft.  When the
-*                                           cyphertext is to be decrypted
-*                                           by a crypto library, the
-*                                           library routine's RSAAPP.NoPadding
-*                                           flag, or its equivalent, should
-*                                           be used.
-*
-*                                           Note that this padding method is
-*                                           claimed to be less secure than
-*                                           PKCS1Padding.
-*
-*                                         PKCS1Padding - the PKCS1v1.5
-*                                           padding method (as described in
-*                                           RFC 2313) is employed to pad the
-*                                           plaintext string.  The plaintext
-*                                           string must be no longer than the
-*                                           length of the RSA key minus 11,
-*                                           since PKCS1v1.5 requires 3 bytes
-*                                           of overhead and specifies a
-*                                           minimum pad of 8 bytes.  The
-*                                           plaintext string is padded with
-*                                           randomly-generated bytes and then
-*                                           its order is reversed to preserve
-*                                           big-endian order before it is
-*                                           encrypted to maintain
-*                                           compatibility with real crypto
-*                                           libraries such as OpenSSL or
-*                                           Microsoft.  When the cyphertext
-*                                           is to be decrypted by a crypto
-*                                           library, the library routine's
-*                                           RSAAPP.PKCS1Padding flag, or its
-*                                           equivalent, should be used.
-*
-* encoding                              The optional encoding scheme to use
-*                                       for the return value. If ommitted,
-*                                       numeric encoding will be used.
-*
-*                                           RawEncoding - The return value
-*                                           is given as its raw value.
-*                                           This is the easiest method when
-*                                           interoperating with server-side
-*                                           OpenSSL, as no additional conversion
-*                                           is required. Use the constant
-*                                           RSAAPP.RawEncoding for this option.
-*
-*                                           NumericEncoding - The return value
-*                                           is given as a number in hexadecimal.
-*                                           Perhaps useful for debugging, but
-*                                           will need to be translated back to
-*                                           its raw equivalent (e.g. using
-*                                           PHP's hex2bin) before using with
-*                                           OpenSSL. Use the constant
-*                                           RSAAPP.NumericEncoding for this option.
-*
-* returns                               The cyphertext block that results
-*                                       from encrypting the plaintext string
-*                                       s with the RSA key.
-*
-* This routine accepts a plaintext string that is to be encrypted with the
-* public key component of the previously-built RSA key using the RSA
-* assymmetric encryption method.  Before it is encrypted, the plaintext
-* string is padded to the same length as the encryption key for proper
-* encryption.
-*
-* Depending on the padding method chosen, an optional header with block type
-* is prepended, the plaintext is padded using zeros or randomly-generated
-* bytes, and then the plaintext is possibly broken up into chunks.
-*
-* Note that, for padding with zeros, this routine was altered by Rob Saunders
-* (rob@robsaunders.net). The new routine pads the string after it has been
-* converted to an array. This fixes an incompatibility with Flash MX's
-* ActionScript.
-*
-* The various padding schemes employed by this routine, and as presented to
-* RSA for encryption, are shown below.  Note that the RSA encryption done
-* herein reverses the byte order as encryption is done:
-*
-*      Plaintext In
-*      ------------
-*
-*      d5 d4 d3 d2 d1 d0
-*
-*      OHDave
-*      ------
-*
-*      d5 d4 d3 d2 d1 d0 00 00 00 /.../ 00 00 00 00 00 00 00 00
-*
-*      NoPadding
-*      ---------
-*
-*      00 00 00 00 00 00 00 00 00 /.../ 00 00 d0 d1 d2 d3 d4 d5
-*
-*      PKCS1Padding
-*      ------------
-*
-*      d0 d1 d2 d3 d4 d5 00 p0 p1 /.../ p2 p3 p4 p5 p6 p7 02 00
-*                            \------------  ------------/
-*                                         \/
-*                             Minimum 8 bytes pad length
-*/
-{
-var a = new Array();                    // The usual Alice and Bob stuff
-var sl = s.length;                      // Plaintext string length
-var i, j, k;                            // The usual Fortran index stuff
-var padtype;                            // Type of padding to do
-var encodingtype;                       // Type of output encoding
-var rpad;                               // Random pad
-var al;                                 // Array length
-var result = "";                        // Cypthertext result
-var block;                              // Big integer block to encrypt
-var crypt;                              // Big integer result
-var text;                               // Text result
-/*
-* Figure out the padding type.
-*/
-if (typeof(pad) == 'string') {
-  if (pad == RSAAPP.NoPadding) { padtype = 1; }
-  else if (pad == RSAAPP.PKCS1Padding) { padtype = 2; }
-  else { padtype = 0; }
-}
-else { padtype = 0; }
-/*
-* Determine encoding type.
-*/
-if (typeof(encoding) == 'string' && encoding == RSAAPP.RawEncoding) {
-	encodingtype = 1;
-}
-else { encodingtype = 0; }
-
-/*
-* If we're not using Dave's padding method, we need to truncate long
-* plaintext blocks to the correct length for the padding method used:
-*
-*       NoPadding    - key length
-*       PKCS1Padding - key length - 11
-*/
-if (padtype == 1) {
-  if (sl > key.chunkSize) { sl = key.chunkSize; }
-}
-else if (padtype == 2) {
-  if (sl > (key.chunkSize-11)) { sl = key.chunkSize - 11; }
-}
-/*
-* Convert the plaintext string to an array of characters so that we can work
-* with individual characters.
-*
-* Note that, if we're talking to a real crypto library at the other end, we
-* reverse the plaintext order to preserve big-endian order.
-*/
-i = 0;
-
-if (padtype == 2) { j = sl - 1; }
-else { j = key.chunkSize - 1; }
-
-while (i < sl) {
-  if (padtype) { a[j] = s.charCodeAt(i); }
-  else { a[i] = s.charCodeAt(i); }
-
-  i++; j--;
-}
-/*
-* Now is the time to add the padding.
-*
-* If we're doing PKCS1v1.5 padding, we pick up padding where we left off and
-* pad the remainder of the block.  Otherwise, we pad at the front of the
-* block.  This gives us the correct padding for big-endian blocks.
-*
-* The padding is either a zero byte or a randomly-generated non-zero byte.
-*/
-if (padtype == 1) { i = 0; }
-
-j = key.chunkSize - (sl % key.chunkSize);
-
-while (j > 0) {
-  if (padtype == 2) {
-    rpad = Math.floor(Math.random() * 256);
-
-    while (!rpad) { rpad = Math.floor(Math.random() * 256); }
-
-    a[i] = rpad;
-  }
-  else { a[i] = 0; }
-
-  i++; j--;
-}
-/*
-* For PKCS1v1.5 padding, we need to fill in the block header.
-*
-* According to RFC 2313, a block type, a padding string, and the data shall
-* be formatted into the encryption block:
-*
-*      EncrBlock = 00 || BlockType || PadString || 00 || Data
-*
-* The block type shall be a single octet indicating the structure of the
-* encryption block. For this version of the document it shall have value 00,
-* 01, or 02. For a private-key operation, the block type shall be 00 or 01.
-* For a public-key operation, it shall be 02.
-*
-* The padding string shall consist of enough octets to pad the encryption
-* block to the length of the encryption key.  For block type 00, the octets
-* shall have value 00; for block type 01, they shall have value FF; and for
-* block type 02, they shall be pseudorandomly generated and nonzero.
-*
-* Note that in a previous step, we wrote padding bytes into the first three
-* bytes of the encryption block because it was simpler to do so.  We now
-* overwrite them.
-*/
-if (padtype == 2)
-  {
-  a[sl] = 0;
-  a[key.chunkSize-2] = 2;
-  a[key.chunkSize-1] = 0;
-  }
-/*
-* Carve up the plaintext and encrypt each of the resultant blocks.
-*/
-al = a.length;
-
-for (i = 0; i < al; i += key.chunkSize) {
-  /*
-  * Get a block.
-  */
-  block = new bigInt();
-
-  j = 0;
-
-  for (k = i; k < (i+key.chunkSize); ++j) {
-    block.digits[j] = a[k++];
-    block.digits[j] += a[k++] << 8;
-  }
-  /*
-  * Encrypt it, convert it to text, and append it to the result.
-  */
-  crypt = key.barrett.powMod(block, key.e);
-  if (encodingtype == 1) {
-	  text = biToBytes(crypt);
-  }
-  else {
-	  text = (key.radix == 16) ? biToHex(crypt) : biToString(crypt, key.radix);
-  }
-  result += text;
-}
-/*
-* Return the result, removing the last space.
-*/
-//result = (result.substring(0, result.length - 1));
-return result;
-}
-
-/*****************************************************************************/
-
-function decryptedString(key, c)
-/*
-* key                                   The previously-built RSA key whose
-*                                       private key component is to be used
-*                                       to decrypt the cyphertext string.
-*
-* c                                     The cyphertext string that is to be
-*                                       decrypted, using the RSA assymmetric
-*                                       encryption method.
-*
-* returns                               The plaintext block that results from
-*                                       decrypting the cyphertext string c
-*                                       with the RSA key.
-*
-* This routine is the complementary decryption routine that is meant to be
-* used for JavaScript decryption of cyphertext blocks that were encrypted
-* using the OHDave padding method of the encryptedString routine (in this
-* module).  It can also decrypt cyphertext blocks that were encrypted by
-* RSAEncode (in CryptoFuncs.pm or CryptoFuncs.php) so that encrypted
-* messages can be sent of insecure links (e.g. HTTP) to a Web page.
-*
-* It accepts a cyphertext string that is to be decrypted with the public key
-* component of the previously-built RSA key using the RSA assymmetric
-* encryption method.  Multiple cyphertext blocks are broken apart, if they
-* are found in c, and each block is decrypted.  All of the decrypted blocks
-* are concatenated back together to obtain the original plaintext string.
-*
-* This routine assumes that the plaintext was padded to the same length as
-* the encryption key with zeros.  Therefore, it removes any zero bytes that
-* are found at the end of the last decrypted block, before it is appended to
-* the decrypted plaintext string.
-*
-* Note that the encryptedString routine (in this module) works fairly quickly
-* simply by virtue of the fact that the public key most often chosen is quite
-* short (e.g. 0x10001).  This routine does not have that luxury.  The
-* decryption key that it must employ is the full key length.  For long keys,
-* this can result in serious timing delays (e.g. 7-8 seconds to decrypt using
-* 2048 bit keys on a reasonably fast machine, under the Firefox Web browser).
-*
-* If you intend to send encrypted messagess to a JavaScript program running
-* under a Web browser, you might consider using shorter keys to keep the
-* decryption times low.  Alternately, a better scheme is to generate a random
-* key for use by a symmetric encryption algorithm and transmit it to the
-* other end, after encrypting it with encryptedString.  The other end can use
-* a real crypto library (e.g. OpenSSL or Microsoft) to decrypt the key and
-* then use it to encrypt all of the messages (with a symmetric encryption
-* algorithm such as Twofish or AES) bound for the JavaScript program.
-* Symmetric decryption is orders of magnitude faster than asymmetric and
-* should yield low decryption times, even when executed in JavaScript.
-*
-* Also note that only the OHDave padding method (e.g. zeros) is supported by
-* this routine *AND* that this routine expects little-endian cyphertext, as
-* created by the encryptedString routine (in this module) or the RSAEncode
-* routine (in either CryptoFuncs.pm or CryptoFuncs.php).  You can use one of
-* the real crypto libraries to create cyphertext that can be decrypted by
-* this routine, if you reverse the plaintext byte order first and then
-* manually pad it with zero bytes.  The plaintext should then be encrypted
-* with the NoPadding flag or its equivalent in the crypto library of your
-* choice.
-*/
-{
-var blocks = c.split(" ");              // Multiple blocks of cyphertext
-var b;                                  // The usual Alice and Bob stuff
-var i, j;                               // The usual Fortran index stuff
-var bi;                                 // Cyphertext as a big integer
-var result = "";                        // Plaintext result
-/*
-* Carve up the cyphertext into blocks.
-*/
-for (i = 0; i < blocks.length; ++i) {
-  /*
-  * Depending on the radix being used for the key, convert this cyphertext
-  * block into a big integer.
-  */
-  if (key.radix == 16) { bi = biFromHex(blocks[i]); }
-  else { bi = biFromString(blocks[i], key.radix); }
-  /*
-  * Decrypt the cyphertext.
-  */
-  b = key.barrett.powMod(bi, key.d);
-  /*
-  * Convert the decrypted big integer back to the plaintext string.  Since
-  * we are using big integers, each element thereof represents two bytes of
-  * plaintext.
-  */
-  for (j = 0; j <= biHighIndex(b); ++j) {
-    result += String.fromCharCode(b.digits[j] & 255, b.digits[j] >> 8);
-  }
-}
-/*
-* Remove trailing null, if any.
-*/
-if (result.charCodeAt(result.length - 1) == 0) {
-  result = result.substring(0, result.length - 1);
-}
-/*
-* Return the plaintext.
-*/
-return (result);
-}
-
-// export {RSAKeyPair}
-module.exports = RSAKeyPair;
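
With the default "OHDave" scheme, encryptedString packs each chunk two bytes per 16-bit digit, least-significant byte first, so a single chunk boils down to modular exponentiation over a little-endian integer. A rough Python equivalent for one chunk, as a sketch under those assumptions (single-byte characters only, placeholder key material, and biToHex additionally left-pads the hex result to whole 4-character digit groups):

    def ohdave_encrypt_chunk(plaintext, e_hex, n_hex):
        """Approximate a one-chunk encryptedString(key, s) call with the default padding."""
        e, n = int(e_hex, 16), int(n_hex, 16)
        chunk_size = 2 * ((n.bit_length() + 15) // 16 - 1)    # 2 * biHighIndex(modulus)
        data = plaintext.encode("latin-1")[:chunk_size].ljust(chunk_size, b"\x00")
        block = int.from_bytes(data, "little")                # digits[j] = b[2j] + (b[2j+1] << 8)
        return format(pow(block, e, n), "x")

    # e.g. ohdave_encrypt_chunk("UsVjaS7Wj4", "10001", "<modulus hex>")  # placeholder values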

+ 0 - 0
NoteWork/cesspider/js/rsa/__init__.py


+ 0 - 109
NoteWork/cesspider/magpces.py

@@ -1,109 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-09-06 14:21
----------
-@summary: utility helpers
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import datetime
-import json
-import re
-from pprint import pformat
-print('sssssssss')
-_regexs = {}
-def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
-    regexs = isinstance(regexs, str) and [regexs] or regexs
-
-    infos = []
-    for regex in regexs:
-        if regex == "":
-            continue
-
-        if regex not in _regexs.keys():
-            _regexs[regex] = re.compile(regex, re.S)
-
-        if fetch_one:
-            infos = _regexs[regex].search(html)
-            if infos:
-                infos = infos.groups()
-            else:
-                continue
-        else:
-            infos = _regexs[regex].findall(str(html))
-
-        if len(infos) > 0:
-            # print(regex)
-            break
-
-    if fetch_one:
-        infos = infos if infos else ("",)
-        return infos if len(infos) > 1 else infos[0]
-    else:
-        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
-        infos = split.join(infos) if split else infos
-        return infos
-def get_json(json_str):
-    """
-    @summary: parse a json object
-    ---------
-    @param json_str: string in json format
-    ---------
-    @result: the parsed json object
-    """
-
-    try:
-        return json.loads(json_str) if json_str else {}
-    except Exception as e1:
-        try:
-            json_str = json_str.strip()
-            json_str = json_str.replace("'", '"')
-            keys = get_info(json_str, "(\w+):")
-            for key in keys:
-                json_str = json_str.replace(key, '"%s"' % key)
-
-            return json.loads(json_str) if json_str else {}
-
-        except Exception as e2:
-            print(
-                """
-                e1: %s
-                format json_str: %s
-                e2: %s
-                """
-                % (e1, json_str, e2)
-            )
-
-        return {}
-
-
-def dumps_json(json_, indent=4, sort_keys=False):
-    """
-    @summary: format json for printing
-    ---------
-    @param json_: json string or json object
-    ---------
-    @result: the formatted string
-    """
-    try:
-        if isinstance(json_, str):
-            json_ = get_json(json_)
-
-        json_ = json.dumps(
-            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
-        )
-
-    except Exception as e:
-        print(e)
-        json_ = pformat(json_)
-
-    return json_
-def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.now().strftime(date_format)
-    # return time.strftime(date_format, time.localtime(time.time()))
-def key2hump(key):
-    """
-    convert snake_case keys to CamelCase
-    """
-    return key.title().replace("_", "")
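
For reference, what the two most commonly used helpers above do in practice (a usage sketch; the import assumes the module above is importable as magpces):

    from magpces import get_json, key2hump   # hypothetical import of the module above

    # get_json tries json.loads first, then swaps quotes / quotes bare keys and retries.
    print(get_json("{'pageNo': 1, 'pageSize': 20}"))   # -> {'pageNo': 1, 'pageSize': 20}

    # key2hump turns snake_case into CamelCase.
    print(key2hump("notice_title"))                    # -> NoticeTitle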

+ 0 - 95
NoteWork/cesspider/中国南方电网电子采购交易平台.py

@@ -1,95 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 11:13:05
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import timestamp_to_date
-
-class Zgnfdzcgjypt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             # Menu('Zgnfdzcgjypt', 'Zgnfdzcgjypt', "Notice", 1),
-             Menu('招标采购公告', 'a_zgnfdwdzcgjypt_zbcggg', "Notice", 3),
-         ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = "https://ecsg.com.cn/api/tender/tendermanage/gatewayNoticeQueryController/queryGatewayNoticeListPagination"
-            for page in range(menu.crawl_page):
-                data = {
-                    "noticeTitle": "",
-                    "publishTime": "",
-                    "organizationInfoName": "",
-                    "pageNo": page + 1,
-                    "pageSize": 20
-                }
-                data = json.dumps(data)
-                yield feapder.Request(url=start_url, item=menu._asdict(), method='POST', data=data)
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        self.count += 1   # simple counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("list")
-        for info in info_list:
-            # href = ''
-            title = info.get("noticeTitle")
-            create_time = info.get("publishTime")
-            create_time = timestamp_to_date(int(create_time/1000), time_format="%Y-%m-%d %H:%M:%S")
-            href = f'https://ecsg.com.cn/cms/NoticeDetail.html?objectId={info.get("objectId")}&objectType={info.get("objectType")}&typeid=4'
-            data = {"objectId": info.get('objectId'), "objectType": "1"}
-            data = json.dumps(data)
-            data_item = DataBakItem()  # item that carries one announcement
-            data_item.href = href  # announcement detail link
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (set by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (set by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # announcement publish time
-            data_item.site = "中国南方电网电子采购交易平台"
-            data_item.area = "全国"  # area, defaults to "全国" (nationwide)
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.request_params = {
-                "data":data,"method":"POST"}
-            list_item.deal_detail = '''
-html = response.json.get("noticeContent")
-            '''
-            list_item.author = "马国鹏"
-            list_item.parse_url = "https://ecsg.com.cn/api/tender/tendermanage/gatewayNoticeQueryController/getNotice"
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-            "Content-Type": "application/json;charset=UTF-8",
-            "Referer": "https://ecsg.com.cn/cms/NoticeList.html?id=159&typeid=4&word=&seacrhDate=",
-            "Accept-Language": "zh-CN,zh;q=0.9"
-        }
-
-if __name__ == "__main__":
-    Zgnfdzcgjypt(redis_key="fwork:Zgnfdzcgjypt").start()

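For reference, the list-page dedup pattern shared by the spider above (and by the other deleted spiders below) reduces to: filter out hrefs that are already recorded, yield detail tasks only for the new ones, then record them in one batch. A minimal sketch, assuming feapder's Dedup behaves as used in these files (filter_exist_data returns only the unseen values):

from feapder.dedup import Dedup

def split_new_hrefs(hrefs):
    """Return (dedup, new_hrefs); the caller yields detail tasks for new_hrefs, then calls dedup.add(new_hrefs)."""
    dedup = Dedup(Dedup.BloomFilter)
    new_hrefs = [href for href in hrefs if dedup.filter_exist_data([href]) != []]
    return dedup, new_hrefs

# Usage sketch (hypothetical URL):
# dedup, todo = split_new_hrefs(["https://example.com/notice/1"])
# ... yield one MgpListItem per href in todo ...
# dedup.add(todo)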
+ 0 - 133
NoteWork/cesspider/中国鲁班商务委.py

@@ -1,133 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-20 13:49:04
----------
-@summary: Zglbsww
----------
-@author: dist
-"""
-import json
-import sys
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zglbsww(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'purchaseType',"orders", 'crawl_page'])
-         self.site= "中铁鲁班商务网"
-
-         self.menus = [
-             Menu('公告补遗-招标采购', 'a_ztlbsww_zhbgg', "CRFQ","publish_time", 1),
-             Menu('公告补遗-询价采购', 'a_ztlbsww_ggby_xjcg', "XJFQ","publish_time", 1),
-             Menu('公告补遗-竞争性谈判', 'a_ztlbsww_cqby', "TPFQ","publish_time", 1),
-             Menu('公告补遗-竞价采购', 'a_ztlbsww_ggby_jjcg', "JJFQ","publish_time", 1),
-
-             Menu('采购公告-招标采购', 'a_ztlbsww_zbgg', "CRFQ","pub_time", 1),
-             Menu('采购公告-询价采购', 'a_ztlbsww_lsxjcg', "XJFQ","pub_time", 1),
-             Menu('采购公告-竞争性谈判', 'a_ztlbsww_jzxtp', "TPFQ","pub_time", 1),
-             Menu('采购公告-竞价采购', 'a_ztlbsww_jjcg', "JJFQ","pub_time", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 '''
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 '''
-                 start_url = f'https://eproport.crecgec.com/epu-portal/portal/project/listWithPage'
-                 data = {
-                     "timeType": "month",
-                     "areaCode": "-1",
-                     "mainType": "-1",
-                     "purchaser": None,
-                     "information": None,
-                     "sTime": "",
-                     "eTime": "",
-                     "classify": "-1",
-                     "region": "-1",
-                     "level": "",
-                     "selectedState": "",
-                     "purchaseType": menu.purchaseType,
-                     "noticeType": 1,
-                     "orders": menu.orders,
-                     "dirs": "desc",
-                     "current": page,
-                     "size": 10,
-                     "page": {}
-                 }
-                 data = json.dumps(data)
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,method="POST",data=data)
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            projectid = info.get("projectId")
-            tenantid = info.get("tenantId")
-            href = f'https://eproport.crecgec.com/#/notice/noticexj-detail?projectId={projectid}&tenantId={tenantid}'
-            title = info.get("projectName")
-            create_time = info.get("publishTime") + ":00"
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_ztlbw"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//*']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="****"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-
-            "Content-Type": "application/json"
-        }
-if __name__ == "__main__":
-    Zglbsww(redis_key="dist:Zglbsww").start()

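The files block in the spider above is still a template (its list_xpath is a '****' placeholder). For readers of this diff, a filled-in example of that attachment configuration follows; the xpaths and host are hypothetical illustrations, not values taken from this site:

# Hypothetical attachment-download configuration for MgpListItem.files
files_config = {
    "list_xpath": '//div[@class="attachments"]/a',  # nodes wrapping each attachment link (assumed class name)
    "url_xpath": './@href',                         # attachment URL relative to each node
    "name_xpath": './text()',                       # attachment display name
    "files_type": ('zip', 'docx', 'pdf'),           # extensions worth downloading
    "url_key": 'http',                              # keyword that marks a usable attachment URL
    "host": 'https://example.com',                  # host joined against relative URLs (assumed)
}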
+ 0 - 76
NoteWork/cesspider/交通银行供应商门户.py

@@ -1,76 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 10:30:30
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Jtyhgysmh(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             # Menu('Jtyhgysmh', 'Jtyhgysmh', "Notice", 1),
-             Menu('Jtyhgysmh', 'Jtyhgysmh', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://bocom-gys.bankcomm.com/espuser/register/noticePage'
-            yield feapder.Request(url=start_url, item=menu._asdict(),render=True,render_time=2)
-
-    def parse(self, request, response):
-        print(response.text)
-        import pdb
-        pdb.set_trace()
-        driver = response.browser
-        driver.refresh()
-        # driver.get(start_url)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
-        for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Jtyhgysmh(redis_key="fwork:Jtyhgysmh").start()

+ 0 - 91
NoteWork/cesspider/华创迅采电子采购平台.py

@@ -1,91 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:48:52
----------
-@summary:  华创迅采电子采购平台   详情信息需登录
----------
-@author: topnet
-"""
-import json
-from urllib.parse import urljoin
-
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Hcxcdzcgpt(feapder.Spider):
-
-    def start_callback(self):
-        self.count = 0
-        self.host= 'https://www.bzeps.com/'
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('Hcxcdzcgpt', 'Hcxcdzcgpt', "Notice", 1),
-            # Menu('Hcxcdzcgpt', 'Hcxcdzcgpt', "Notice", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1, menu.crawl_page + 1):
-                start_url = f'https://www.bzeps.com/list/purchase/{page}'
-                data = {
-                    "code": "purchase",
-                    "keyword": "",
-                    "searchType": ""
-                }
-                data = json.dumps(data)
-                yield feapder.Request(url=start_url, item=menu._asdict(), method="POST", data=data)
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1  # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("list")
-        for info in info_list:
-            href = urljoin(self.host, info.get("url"))
-            title = info.get("title")
-            create_time = info.get("pubTime")
-            area = info.get("area")
-            pro = area.split("-")[0]+"省"
-            city = area.split("-")[-1]
-            print(create_time,pro,city,title)
-            print(href)
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "华创迅采电子采购平台"
-            data_item.area = pro  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="gbox"]']
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        request.headers = {
-            "Content-Type": "application/json; charset=UTF-8",
-        }
-        return request
-
-
-if __name__ == "__main__":
-    Hcxcdzcgpt(redis_key="fwork:Hcxcdzcgpt").start()

+ 0 - 70
NoteWork/cesspider/国家税务总局宁波市税务局.py

@@ -1,70 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-22 11:01:07
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Gjswjzjnbswj(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Gjswjzjnbswj', 'Gjswjzjnbswj', "Notice", 1),
-             Menu('Gjswjzjnbswj', 'Gjswjzjnbswj', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f''
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
-        for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.xpath = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gjswjzjnbswj(redis_key="fwork:Gjswjzjnbswj").start()

+ 0 - 80
NoteWork/cesspider/城轨采购网.py

@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-27 10:54:26
----------
-@summary:
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-def gotoPage(types,fid):  #onclick 的函数,生成url
-    if types == "1" or types == "2":  # 比价公告
-        return "/Purchase/Notice/NewDetail?Id="+fid
-    elif types == "3":  # 在线询价
-        return "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID=" + fid + "&AddNew=0"
-    elif types == "4": # 招标项目
-        return "/Bids/BidsNotice/NewDetail?Id="+fid
-    elif types == "5": #单一来源公示
-        return "/SingleSourceNotice/Notice/NewDetail?Id=" + fid
-
-class Cgcgw(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Cgcgw', 'Cgcgw', "Notice", 1),
-             Menu('Cgcgw', 'Cgcgw', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://www.mtrmart.com/Purchase/Notice/SearchNewList?title=&category=&noticeType=&noticeTypeStr=&NoSinglesource=&companyValue=&isInProgress=n&isOneYear=y&page=2&pageSize=10'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//ul[@class="base-list"]/li')
-        for info in info_list:
-            href = "https://www.mtrmart.com/" + eval(info.xpath('./h6/span/@onclick').extract_first().strip(";"))
-            title = info.xpath('./h6/@title').extract_first()
-            create_time = info.xpath('./p/span[2]/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "城轨采购网"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-        #     yield list_item
-        # dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Cgcgw(redis_key="fwork:Cgcgw").start()

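The spider above re-implements the page's onclick handler as gotoPage() and builds the detail URL by eval()-ing the extracted call, which also mis-joins the absolute URL returned for type "3". A dict-based dispatch plus urljoin avoids eval; this is a sketch using the same type-to-path mapping, not the original implementation:

from urllib.parse import urljoin

HOST = "https://www.mtrmart.com/"
DETAIL_PATHS = {
    "1": "/Purchase/Notice/NewDetail?Id={fid}",            # 比价公告
    "2": "/Purchase/Notice/NewDetail?Id={fid}",
    "3": "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID={fid}&AddNew=0",  # 在线询价
    "4": "/Bids/BidsNotice/NewDetail?Id={fid}",            # 招标项目
    "5": "/SingleSourceNotice/Notice/NewDetail?Id={fid}",  # 单一来源公示
}

def build_detail_url(types, fid):
    """Build a detail-page URL for the given notice type without eval(); absolute URLs pass through urljoin unchanged."""
    return urljoin(HOST, DETAIL_PATHS[types].format(fid=fid))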
+ 0 - 74
NoteWork/cesspider/山西省招标投标协会.py

@@ -1,74 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-27 10:34:50
----------
-@summary:
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Sxsztbxh(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Sxsztbxh', 'Sxsztbxh', "Notice", 1),
-             Menu('Sxsztbxh', 'Sxsztbxh', "Notice", 1),
-         ]
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1,menu.crawl_page+1):
-                start_url = f'http://www.sxtba.com/prod-api/web/prequalificationList?releaseType=&pageNum={page}&pageSize=10'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = info.get("")
-
-            href = f'http://www.sxtba.com/home/zcInfoChildDetail?id={info}&noticeListRoute=NprequalificationNotice&projectTypes=otherBidding'
-            title = info.get("noticeTitle")
-            create_time = info.get("createTime")
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//****',"*****"]
-            list_item.author = "****"
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Sxsztbxh(redis_key="fwork:Sxsztbxh").start()

+ 0 - 32
NoteWork/cesspider/广东测试.py

@@ -1,32 +0,0 @@
-import requests
-
-
-headers = {
-
-    # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-url = "https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do"
-params = {
-    "": "",
-    "siteId": "cd64e06a-21a7-4620-aebc-0576bab7e07a",
-    "channel": "fca71be5-fc0c-45db-96af-f513e9abda9d",
-    "currPage": "1",
-    "pageSize": "10",
-    "noticeType": "00103",
-    "regionCode": "440001",
-    "verifyCode": "2158",
-    "subChannel": "false",
-    "purchaseManner": "",
-    "title": "",
-    "openTenderCode": "",
-    "purchaser": "",
-    "agency": "",
-    "purchaseNature": "",
-    "operationStartTime": "",
-    "operationEndTime": "",
-    "selectTimeName": "noticeTime"
-}
-response = requests.get(url, params=params)
-
-print(response.text)
-print(response)

+ 0 - 137
NoteWork/cesspider/广东省政府采购网.py

@@ -1,137 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-18 09:41:49
----------
-@summary: Gdszfcgw
----------
-@author: dist
-"""
-import sys
-from urllib.parse import urljoin
-
-import requests
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder,time
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.get_imgcode import get_code
-#
-# # custom_settings = { 'DOWNLOAD_DELAY': 10, 'CONCURRENT_REQUESTS_PER_IP': 4, 'DOWNLOADER_MIDDLEWARES': {}, }
-# settings = { 'LOG_LEVEL': "INFO" }
-class Gdszfcgw(feapder.Spider):
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'noticetype','notchannel', 'crawl_page'])
-         self.site= "广东省政府采购网"
-         self.host = 'https://gdgpo.czt.gd.gov.cn'
-
-         self.menus = [
-             Menu('采购意向公开', 'gd_gdszfcgwxwz_cgyxgk','59','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('单一来源公示', 'gd_gdszfcgwxwz_cggg_pccgyxgk','001051','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('采购计划', 'gd_gdszfcgwxwz_cgjh', '001101','95ff31f3-a1af-4bc4-b1a2-54c894476193', 1),   #1
-             Menu('采购需求', 'gd_gdszfcgwxwz_cgxq', '001059','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('资格预审公告', 'gd_gdszfcgwxwz_zgysgg', '001052,001053','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #2
-             Menu('采购公告', 'gd_gdszfcgwxwz_cggg', '00101','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('中标成交公告', 'gd_gdszfcgwxwz_zbcjgg', '00102','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('更正公告', 'gd_gdszfcgwxwz_gzgg', '00103','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('终止公告', 'gd_gdszfcgwxwz_zzgg', '001004,001006','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #3
-             Menu('合同公告', 'gd_gdszfcgwxwz_htgg', '001054','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('验收公告', 'gd_gdszfcgwxwz_ysgg', '001009,00105A','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '201022,201023,201111,00107D','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '202022,202023,202111,00107E,001076','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001071','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '204022,204023,204111,204112','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001054', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001009,00105A', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-
-             # Menu('批量采购', 'gd_gdszfcgwxwz_plcg',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/dzmcgg/index.html', 1),
-             # Menu('进口产品清单', 'gd_gdszfcgwxwz_jkcpqd',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/jkcpqd/index.html','','d7284b7e-29e9-4fe4-bad3-b187ec8edbf9' 1),
-         ]
-    def start_requests(self):
-        code = self.get_code()
-        for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do?&siteId=cd64e06a-21a7-4620-aebc-0576bab7e07a&channel={menu.notchannel}&currPage={page}&pageSize=10&noticeType={menu.noticetype}&regionCode=440001&verifyCode={code}&subChannel=false&purchaseManner=&title=&openTenderCode=&purchaser=&agency=&purchaseNature=&operationStartTime=&operationEndTime=&selectTimeName=noticeTime'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-    def get_code(self):
-        img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
-        header = {"Host": "gdgpo.czt.gd.gov.cn",
-                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
-                  "Origin": "https://gdgpo.czt.gd.gov.cn",
-
-                  }
-        res = requests.get(img_url, headers=header)
-        with open('image/guangdong.jpg', 'wb+') as f:
-            f.write(res.content)
-        res = get_code('image/guangdong.jpg')
-        if res.get("msg")=="success":
-            img_code = res.get("r").get("code")
-        else:
-            img_code = None
-        return img_code
-
-
-    def parse(self, request, response):
-        time.sleep(0.3)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data")
-        for info in info_list:
-            href = info.get("pageurl")
-            title = info.get("shorttitle")
-            create_time = info.get("addtimeStr")
-            href = urljoin(self.host, href)
-
-            area = "广东"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="info-article in active"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="info-article in active"]//div/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp','pdf'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gdszfcgw(redis_key="dist:Gdszfcgw").start()

+ 0 - 9
NoteWork/cesspider/测试查询.py

@@ -1,9 +0,0 @@
-from feapder.dedup import Dedup
-url='http://www.ccgp-tianjin.gov.cn/viewer.do?id=299263823&ver=2'
-
-dedup = Dedup(Dedup.BloomFilter)
-ss = dedup.filter_exist_data([url])
-if ss == []:
-    print('不存在,未在此库操作')
-else:
-    print('已去重')

+ 0 - 114
NoteWork/cesspider/滁州市人民政府网.py

@@ -1,114 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-14 20:02:21
----------
-@summary: 滁州市人民政府网
----------
-@author: mgp
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Czsrmzf(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "滁州市人民政府网"
-
-         self.menus = [
-             Menu('政府信息公开目录-公立医疗机构药品医用设备采购', 'ah_czsrmzfw_gcztb_zbgg', "自定义参数", 1),
-             Menu('重大建设项目-招标投标信息', 'ah_czsrmzfw_zfcg_cggg', "自定义参数", 1),
-             Menu('政府采购', 'ah_czsrmzfw_gcztb_zbgs', "Notice", 1),
-             Menu('工程建设招投标', 'ah_czsrmzfw_zfcg_zbcjgg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.chuzhou.gov.cn/chuzhou/site/label/8888'
-                 params = {
-                        "IsAjax": "1",
-                        "dataType": "html",
-                        "_": "0.5840033326645138",
-                        "labelName": "publicInfoList",
-                        "siteId": "2653861",
-                        "pageSize": "20",
-                        "pageIndex": "3",
-                        "action": "list",
-                        "isDate": "true",
-                        "dateFormat": "yyyy-MM-dd",
-                        "length": "50",
-                        "organId": "2681509",
-                        "type": "4",
-                        "catId": "161735369",
-                        "cId": "",
-                        "result": "暂无相关信息",
-                        "title": "",
-                        "fileNum": "",
-                        "keyWords": "",
-                        "file": "/c1/chuzhou/publicInfoList_newest"
-                    }
-                 yield feapder.Request(url=start_url, params=params, item=menu._asdict(), proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul")
-        for info in info_list:
-            href = info.xpath("./li/a/@href").extract_first().strip()
-            title = info.xpath("./li/a/@title").extract_first().strip()
-            create_time = info.xpath("./li/span/text()").extract_first().strip()
-            area = "安徽"  # 省份
-            city = "滁州市"  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            # list_item.parser_name = "detail_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentbox minh500"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//a[contains(@data-file-ext,"D")]',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host": 'https://www.chuzhou.gov.cn'
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Czsrmzf(redis_key="magp:Czsrmzf").start()

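Note that start_requests above loops over crawl_page but never uses page: pageIndex stays hard-coded at "3". If real pagination was intended, the parameter would have to follow the loop variable; a minimal sketch of that wiring (same endpoint and field names as above, trimmed to the parameters that matter):

import feapder  # assumes the project's feapder environment

def build_page_requests(menu, crawl_page):
    """Yield one list-page request per page, with pageIndex tracking the loop variable."""
    start_url = "https://www.chuzhou.gov.cn/chuzhou/site/label/8888"
    for page in range(1, crawl_page + 1):
        params = {
            "IsAjax": "1",
            "dataType": "html",
            "labelName": "publicInfoList",
            "siteId": "2653861",
            "pageSize": "20",
            "pageIndex": str(page),  # follow the loop variable instead of the fixed "3"
            "action": "list",
        }
        yield feapder.Request(url=start_url, params=params, item=menu, proxies=False)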
+ 0 - 197
NoteWork/cesspider/甘肃政府采购网.py

@@ -1,197 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-        self.count = 0
-        self.prox_pool = ProxyPool()
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/{crawl_page}/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/{crawl_page}/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/{crawl_page}/index.htm?contractsInfo.id=d0", 13),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/{crawl_page}/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 10),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            for page in range(menu.crawl_page):
-                url = menu.url.format(crawl_page=page*10)
-                print(url)
-                yield feapder.Request(url=url, item=menu._asdict(), render=True, callback=eval(menu.parse),render_time=2)
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()

+ 0 - 213
NoteWork/cesspider/甘肃政府采购网_ces.py

@@ -1,213 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from feapder.network.cookie_pool import PageCookiePool
-
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    cookie_pool = PageCookiePool(redis_key='fwork:gszfcg',
-                                 page_url='http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm',driver_type='FIREFOX',executable_path="D:\\geckodriver.exe")
-    def start_callback(self):
-        self.count = 0
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/0/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0", 1),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/0/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 1),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            yield feapder.Request(url=menu.url, item=menu._asdict(),callback=eval(menu.parse))
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        # self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        print('newsList_count',len(response.xpath("//ul[@class='newsList']/li")))
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-            print(create_time,title)
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.headers = {
-            "Connection": "keep-alive",
-            "Cache-Control": "max-age=0",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "Referer": "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm",
-            "Accept-Language": "zh-CN,zh;q=0.9"
-        }
-
-        request.cookies = self.cookie_pool.get_cookie()
-        return request
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()

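The _ces variant above replaces selenium rendering with a PageCookiePool and injects the pooled cookie in download_midware. The core of that pattern, using the same constructor arguments as the deleted file:

from feapder.network.cookie_pool import PageCookiePool

# A browser session against the list page produces cookies that later plain requests reuse.
cookie_pool = PageCookiePool(
    redis_key="fwork:gszfcg",
    page_url="http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm",
    driver_type="FIREFOX",
    executable_path="D:\\geckodriver.exe",
)

def attach_cookie(request):
    """Minimal download_midware: attach a pooled cookie to every outgoing request."""
    request.cookies = cookie_pool.get_cookie()
    return request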
+ 0 - 194
NoteWork/cesspider/甘肃政府采购网_new.py

@@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-01 16:37:53
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import feapder
-from items.spider_item import DataBakItem, MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-import time
-from lxml import etree
-import re
-
-
-class Gszfcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-        self.count = 0
-        self.prox_pool = ProxyPool()
-        self.cookie = None
-        self.host = 'http://www.ccgp-gansu.gov.cn/'
-        Menu = namedtuple('Menu', ['channel', 'code', "parse", 'render_time', 'url', 'crawl_page'])
-
-        self.menus = [
-            Menu('定点采购', 'a_gszfcgw_ddcg', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/142/0/index.htm", 1),
-            Menu('协议供货-公告栏', 'a_gszfcgw_xygh_ggl', "self.parse_num3",2,
-                 "http://www.ccgp-gansu.gov.cn/web/article/13001/0/index.htm", 1),
-            # Menu('协议供货定点采购合同', 'a_gszfcgw_xyghddcght',  "self.parse_num1",2, "Notice", 1),
-            Menu('招标项目合同', 'a_gszfcgw_zbxmht', "self.parse_num1", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0", 1),
-            Menu('最新标讯', 'a_gszfcgw_zxbx', "self.parse_num2", 2,
-                 "http://www.ccgp-gansu.gov.cn/web/articlenews/1/0/index.htm?articleSearchInfo.days=21&articleSearchInfo.division=d0",
-                 1),
-            Menu('综合查询-全部', 'gs_gszfcgw_zhcx_qb', "self.parse",2,
-                 "http://www.ccgp-gansu.gov.cn/web/doSearchmxarticlelssj.action", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            print(menu.parse)
-            yield feapder.Request(url=menu.url, item=menu._asdict(), render=True, callback=eval(menu.parse),render_time=2)
-
-
-    def parse(self, request, response):
-        browser = response.browser
-        browser.find_element_by_name("button").click()
-        # self.cookie = response.cookies
-        smenu = request.item
-        response = etree.HTML(browser.page_source)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='Expand_SearchSLisi']/li"):
-            title = info.xpath('./a/text()')[0]
-            href = self.host + info.xpath('./a/@href')[0]
-            create_time = re.findall(r'\| 发布时间:(.*?) \|', etree.tounicode(info))[0]
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = smenu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = smenu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='articleCon']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def parse_num1(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in response.xpath("//ul[@class='newsList']/li"):
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # 存储数据的管道
-            item_data.href = href  # 标书链接
-            item_data.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            item_data.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            item_data.title = title  # 标题
-            item_data.publishtime = create_time  # 标书发布时间
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # 城市默认:全国
-            item_data.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num2(self, request, response):
-        menu = request.item
-        cookie = response.cookies
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num2,cookies=response.cookies)
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./a/@href').extract_first()
-            create_time = info.xpath('./p/span/text()').extract_first().strip()
-            create_time = re.findall('审核时间:(.*?) \|',create_time)[0]
-
-            item_data = DataBakItem()  # item that carries the scraped data downstream
-            item_data.href = href  # notice detail URL
-            item_data.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            item_data.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            item_data.title = title  # notice title
-            item_data.publishtime = create_time  # notice publish time
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # region; defaults to 全国 (nationwide) when unknown
-            item_data.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def parse_num3(self, request, response):
-        menu = request.item
-        info_list = response.xpath("//*[@class='mBd']/ul/li")
-        if not info_list and menu.get("render_time")<5:
-            yield feapder.Request(url=request.url, item=menu,callback=self.parse_num3,cookies=response.cookies)
-
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        for info in info_list:
-            title = info.xpath('./span[2]/a/text()').extract_first()
-            if title is None:
-                continue
-            href = info.xpath('./span[2]/a/@href').extract_first()
-            create_time = info.xpath('./span[1]/text()').extract_first().strip() + ' 00:00:00'
-
-            item_data = DataBakItem()  # item that carries the scraped data downstream
-            item_data.href = href  # notice detail URL
-            item_data.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            item_data.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            item_data.title = title  # notice title
-            item_data.publishtime = create_time  # notice publish time
-            item_data.site = "甘肃政府采购网"
-            item_data.area = "甘肃省"  # region; defaults to 全国 (nationwide) when unknown
-            item_data.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = item_data.to_dict
-            list_item.deal_detail = ["//div[@class='mBd']"]
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Gszfcg(redis_key="magp:gszfcg").start()
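
Every list parser above repeats the same dedup pattern: each href is checked with filter_exist_data before a detail task is yielded, and the batch of new hrefs is recorded with add once the page has been walked. A minimal sketch of that pattern, assuming feapder's Dedup behaves as the deleted code uses it (filter_exist_data returning only values not yet in the filter); the helper name yield_new_hrefs is illustrative, not part of the original.

    from feapder.dedup import Dedup

    def yield_new_hrefs(hrefs):
        """Yield only the hrefs not seen before, then record them in the filter."""
        dedup = Dedup(Dedup.BloomFilter)                  # Bloom-filter backed dedup, as in the spiders above
        new_hrefs = dedup.filter_exist_data(list(hrefs))  # assumption: returns only values NOT already in the filter
        for href in new_hrefs:
            yield href
        dedup.add(new_hrefs)                              # runs once the caller has consumed the generator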

+ 0 - 106
NoteWork/cesspider/福建省政府采购网.py

@@ -1,106 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-06 16:37:37
----------
-@summary: 福建省政府采购网.py
----------
-@author: FworkSpider
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-import random
-import requests
-from untils.chaojiying import Chaojiying_Client
-
-class Fjszfcgw(feapder.Spider):
-    str = '天仙丛付印五仔六五乐四甩瓜九七一失令斤册禾十仗丘非田白付乐仪八代匆乎二们句生四用'
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('项目公告-全部', 'fj_fjszfcgw_xmgg_qb', "自定义参数", 10),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             # for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={1}&verifycode={"".join(random.sample(self.str,4))}'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),page=1,render=True,render_time=2)
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//tbody/tr")
-        if info_list == []:
-            img_url = 'http://www.ccgp-fujian.gov.cn/noticeverifycode/?1'
-            print('出现验证码')
-            img_res = requests.get(img_url)
-            with open('a.jpg', 'wb+') as f:
-                f.write(img_res.content)
-            # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')  # generate a software ID in the user center and substitute it here (e.g. 96001)
-            # im = open('a.jpg', 'rb').read()  # local image file path replacing a.jpg; on Windows the path sometimes needs //
-            # print(chaojiying.PostPic(im, 1902))
-            # res = chaojiying.PostPic(im, 2004)
-            # print(res)
-            # if res.get("err_no") != 0:
-            #     chaojiying.ReportError(res.get("pic_id"))
-            # code = res.get("pic_str")
-            url = request.url[:-4]+"".join(random.sample(self.str,4))
-            yield feapder.Request(url=url, item=menu,random_user_agent=False,page=request.page,render=True,render_time=2)
-            return
-        for info in info_list:
-            href = info.xpath('./td/a/@href').extract_first()
-            title = info.xpath('./td/a/text()').extract_first()
-            create_time = info.xpath('./td[5]/text()').extract_first()
-
-            data_item = DataBakItem()  # item that carries the scraped data downstream
-            data_item.href = href  # notice detail URL
-            data_item.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            data_item.title = title  # notice title
-            data_item.publishtime = create_time  # notice publish time
-            data_item.site = "福建省政府采购网"
-            data_item.area = "福建"  # region; defaults to 全国 (nationwide) when unknown
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="notice-con"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="notice-foot"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'),
-                "file_type":'zip',
-                "url_key":'attach',
-            }
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-        page_url =  f'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={request.page+1}&verifycode={"".join(random.sample(self.str,4))}'
-        if request.page < menu.get("crawl_page"):
-            yield feapder.Request(url=page_url, use_session=True, item=menu, proxies=False,
-                                  random_user_agent=False, page=request.page+1)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers={
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-        }
-
-if __name__ == "__main__":
-    Fjszfcgw(redis_key="FworkSpider:Fjszfcgw2").start()
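
The spider above works around the site's verifycode by resampling four characters from a fixed pool and rebuilding the list URL whenever the result table comes back empty, rather than solving the captcha image (the Chaojiying path stays commented out). A small sketch of that URL builder, grounded in the deleted code; list_url is an illustrative helper name.

    import random

    # character pool the site accepts for the verifycode parameter (copied from the spider above)
    CODE_POOL = '天仙丛付印五仔六五乐四甩瓜九七一失令斤册禾十仗丘非田白付乐仪八代匆乎二们句生四用'

    def list_url(page: int) -> str:
        """Build the notice-list URL with a freshly sampled verifycode."""
        verifycode = "".join(random.sample(CODE_POOL, 4))
        return ("http://www.ccgp-fujian.gov.cn/3500/noticelist/"
                f"e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode={verifycode}")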

+ 0 - 24
NoteWork/cesspider/黔云招采电子招标采购交易平台

@@ -1,24 +0,0 @@
-Qyzcdzzbcgjypt|2022-01-10 17:58:08,690|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Thread-5|2022-01-10 17:58:17,753|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Qyzcdzzbcgjypt.parse error -------------
-                            error          XPath error: Invalid expression in //*[
-                            response       <Response [500]>
-                            deal request   <Request https://www.e-qyzc.com/gg/toXinXiList>
-                            
-Thread-5|2022-01-10 17:58:17,773|parser_control.py|deal_requests|line:349|INFO| 
-                                    入库 等待重试
-                                    url     https://www.e-qyzc.com/gg/toXinXiList
-                                    重试次数 1
-                                    最大允许重试次数 2
-Thread-5|2022-01-10 17:58:20,708|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Qyzcdzzbcgjypt.parse error -------------
-                            error          XPath error: Invalid expression in //*[
-                            response       <Response [500]>
-                            deal request   <Request https://www.e-qyzc.com/gg/toXinXiList>
-                            
-Thread-5|2022-01-10 17:58:20,709|parser_control.py|deal_requests|line:349|INFO| 
-                                    入库 等待重试
-                                    url     https://www.e-qyzc.com/gg/toXinXiList
-                                    重试次数 2
-                                    最大允许重试次数 2
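
The log above shows the spider dying on a half-finished XPath ('//*[') that only surfaced at parse time, after burning both retries. One way to fail fast is to compile expressions up front with lxml (feapder's selectors are lxml-based); compile_xpath below is an illustrative helper, not part of the original code.

    from lxml import etree

    def compile_xpath(expr: str) -> etree.XPath:
        """Compile an XPath eagerly so a typo fails immediately instead of after N retries."""
        try:
            return etree.XPath(expr)
        except etree.XPathSyntaxError as exc:
            raise ValueError(f"invalid xpath {expr!r}: {exc}") from exc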

+ 0 - 93
NoteWork/cesspider/黔云招采电子招标采购交易平台.py

@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-10 09:47:56
----------
-@summary:	黔云招采电子招标采购交易平台
----------
-@author: topnet
-"""
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Qyzcdzzbcgjypt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         self.site='黔云招采电子招标采购交易平台'
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('询价采购-采购公告', 'gz_qyzcdzzbcgjypt_xjcg_cggg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.e-qyzc.com/gg/toXinXiList?gongGaoType=5'
-                 data = {
-                    "currentPage": str(page),
-                    "xmBH": "",
-                    "ggName": "",
-                    "hangYeType": "",
-                    "zbrName": "",
-                    "zbdlName": ""
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False, data=data,method="POST")
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # simple page counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//table[@id="p1"]/tr[position()>1]')
-        for info in info_list:
-            href = info.xpath('./td/a/@href').extract_first().strip()
-            title = info.xpath('./td/a/text()').extract_first().strip()
-            create_time = info.xpath('./td[5]/text()').extract_first().strip()
-
-            data_item = DataBakItem()  # item that carries the scraped data downstream
-            data_item.href = href  # notice detail URL
-            data_item.channel = menu.get("channel")  # crawl channel defined in the menu above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menu above (assigned by the editor)
-            data_item.title = title  # notice title
-            data_item.publishtime = create_time  # notice publish time
-            data_item.site = self.site
-            data_item.area = "贵州省"  # region; defaults to 全国 (nationwide) when unknown
-            data_item.city = ""  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="page_contect bai_bg"]']
-            if "guid" not in href:
-                continue
-            uid = href.split("guid=")[-1].split("&")[0]
-            list_item.parse_url = f"https://www.e-qyzc.com/waiburukou/xjcgGongGao/view/{uid}.html"
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-
-    # def exception_request(self, request, response):
-
-    def end_callback(self):
-        # list = ListItem()
-        # list.site=
-        print("爬虫结束")
-
-
-if __name__ == "__main__":
-    Qyzcdzzbcgjypt(redis_key="fwork:Qyzcdzzbcgjypt2").start()
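
Note that the detail URL here is not the list link itself: parse() pulls the guid query parameter out of the href and rebuilds a direct view URL. A sketch of that transform as the deleted code performs it; detail_url is an illustrative helper name.

    def detail_url(href: str) -> str:
        """Rebuild the detail-page URL from the guid carried in the list link."""
        if "guid" not in href:
            raise ValueError("list link carries no guid parameter")
        uid = href.split("guid=")[-1].split("&")[0]
        return f"https://www.e-qyzc.com/waiburukou/xjcgGongGao/view/{uid}.html"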

+ 0 - 15
NoteWork/details/__init__.py

@@ -1,15 +0,0 @@
-import requests
-
-
-headers = {
-
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
-}
-cookies = {
-    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
-}
-url = "http://www.snszgh.gov.cn/gsgg/index.html"
-response = requests.get(url, headers=headers,verify=False)
-
-print(response.text)
-print(response)
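
One note on the throwaway probe above: verify=False makes urllib3 emit an InsecureRequestWarning on every call, and the request has no timeout. A minimal variant that keeps the same URL and headers while silencing the warning; the 10-second timeout is an assumption, not in the original.

    import requests
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # verify=False would otherwise warn per request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }
    response = requests.get(
        "http://www.snszgh.gov.cn/gsgg/index.html",
        headers=headers,
        verify=False,
        timeout=10,  # assumed value; the original snippet sets no timeout
    )
    print(response.status_code, len(response.text))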

+ 0 - 194
NoteWork/details/detail_dtcookie.py

@@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  detail handler that generates cookies with a limited validity period and reuses them; IPs are not restricted by default
----------
-@author: 马国鹏
-"""
-import sys
-from urllib.parse import urljoin
-
-from untils.attachment import AttachmentDownloader
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from dtcookie_pool import *
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","item.site":"合肥市人民政府"},sort={"date":-1},limit=1)
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item, files_info=item.get("files"),
-                                          down_mid=item.get("down_mid"), **request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),down_mid=item.get("down_mid"), files_info=item.get("files"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''handle HTML-formatted responses'''
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when response.code is not an accepted status code, drop the current cookie and generate a new one'''
-            down_mid = request.down_mid
-            cookie_pool_class = down_mid.get("cookie_pool")
-            cookie_pool = eval(cookie_pool_class)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # notice detail content
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files = response.xpath(files_info.get("list_xpath"))
-            if request.files_info:
-                files_info = request.files_info
-                files = response.xpath(files_info.get("list_xpath"))
-                if len(files) > 0:
-                    attachments = {}
-                    for index, info in enumerate(files):
-                        file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                        file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                        if files_info.get("host"):
-                            file_url = urljoin(files_info.get("host"), file_url)
-                        if not files_info.get("file_type"):
-                            file_type = file_url.split("?")[0].split(".")[-1].lower()
-                        else:
-                            file_type = files_info.get("file_type")
-                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                            attachment = AttachmentDownloader().fetch_attachment(
-                                file_name=file_name, file_type=file_type, download_url=file_url,
-                                enable_proxy=False)
-                            attachments[len(attachments) + 1] = attachment
-                    if len(attachments) == 0:
-                        pass
-                    else:
-                        list_item.projectinfo = {"attachment": attachments}
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''handle JSON and other non-HTML responses'''
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when response.code is not an accepted status code, drop the current cookie and generate a new one'''
-            down_mid = request.down_mid
-            cookie_pool_class = down_mid.get("cookie_pool")
-            cookie_pool = eval(cookie_pool_class)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''after the request/parse retry limit is exceeded, write the original task back to mongo and update its failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    alert according to the spider's priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "max-age=0",
-            "Connection": "keep-alive",
-            "Host": "www.hefei.gov.cn",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-        }
-        down_mid = request.down_mid
-        cookie_pool_class = down_mid.get("cookie_pool")
-        cookie_pool = eval(cookie_pool_class)
-        request.cookies = cookie_pool.get_cookie()
-        request.headers=headers
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
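
The cookie handling in this handler is symmetric: download_midware attaches a cookie from the pool, and detail_get/detail_json drop that cookie and re-queue the request whenever the status code falls inside the per-site failure set carried in down_mid. A compressed sketch of that round trip, assuming a pool object exposing get_cookie()/del_cookie() like the PageCookiePool used above; attach_cookie and check_response are illustrative names.

    def attach_cookie(request, cookie_pool):
        """download_midware side: pull a (possibly cached) cookie from the pool."""
        request.cookies = cookie_pool.get_cookie()
        return request

    def check_response(request, response, cookie_pool):
        """parser side: on a blocked status code, burn the cookie and retry the request."""
        bad_codes = request.down_mid.get("code", [])   # e.g. [403, 412] -- per-site assumption
        if response.status_code in bad_codes:
            cookie_pool.del_cookie(request.cookies)    # discard the cookie that got blocked
            return request                             # re-queued; a fresh cookie is attached on the next pass
        return None                                    # response is usable, carry on parsing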

+ 0 - 134
NoteWork/details/detail_ztlbw.py

@@ -1,134 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.log import Log
-from feapder.utils.tools import wechat_warning
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from login_pool.zglbw import ZglbwPool
-from untils.attachment import AttachmentDownloader
-
-Log().info("")
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
-                                        sort={"date": -1}, limit=1)
-            print(data_lsit)
-            for item in data_lsit:
-                url = item.get("parse_url")
-                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
-                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
-                cookie = cookie.get_cookie().cookie
-                yield feapder.Request(url=url, item=item.get("item"),
-                                      callback=self.detail_get, base_info=item, render=True,
-                                      render_time=3, proxies=False, cookies=cookie)
-                self.to_db.delete(self.db_name, item)
-            break
-
-    def detail_get(self, request, response):
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key, items[key])
-        html = ''
-        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
-                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
-                      '//div[@class="login ng-star-inserted"]']
-        for xpath in xpath_list:
-            # import pdb
-            # pdb.set_trace()
-            html_one = response.xpath(xpath).extract_first()
-            if html_one is not None:
-                html += '\n'  # notice detail content
-                html += html_one  # concatenate the html fragments
-        print(html)
-        list_item.contenthtml = html
-        files_list = response.xpath("//iframe/@src").extract_first()
-        file_url = files_list.split("file=")[-1]
-        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
-        attachments = {}
-        file_name = list_item.title
-
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type='pdf', download_url=file_url,
-            enable_proxy=False)
-        attachments["0"] = attachment
-        list_item.projectinfo = {"attachments": attachments}
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''after the request/parse retry limit is exceeded, write the original task back to mongo and update its failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    alert according to the spider's priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:ztlbw").start()
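
The attachment URL in this handler is recovered from the PDF viewer's iframe src by splitting on 'file=' and undoing the percent-encoding with hand-rolled replace calls. urllib.parse.unquote performs the same decoding in one call; the sketch below assumes the iframe src embeds the full file URL after 'file=', as the deleted code does, and file_url_from_iframe is an illustrative helper name.

    from urllib.parse import unquote

    def file_url_from_iframe(src: str) -> str:
        """Recover the attachment URL embedded in the viewer iframe's file= parameter."""
        encoded = src.split("file=")[-1]
        return unquote(encoded)  # covers %3A %2F %3F %3D and any other escapes the hand-rolled replaces miss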

+ 0 - 1082
NoteWork/details/details

@@ -1,1082 +0,0 @@
-Thread-5|2022-01-28 17:06:38,101|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Details|2022-01-28 17:06:38,102|scheduler.py|<lambda>|line:112|INFO| 
-********** feapder begin **********
-Details|2022-01-28 17:06:38,103|scheduler.py|__add_task|line:215|INFO| 检查到有待做任务 8 条,不重下发新任务,将接着上回异常终止处继续抓取
-Thread-4|2022-01-28 17:06:47,221|collector.py|__input_data|line:108|INFO| 重置丢失任务完毕,共8条
-Thread-5|2022-01-28 17:06:48,223|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/
-                method = GET
-                body = {'files': {'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp'], 'file_type': 'doxc', 'url_key': 'http', 'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,270|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,270|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max retries exceeded with url: /3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000259309BDAF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95S\x05\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03_id\x94'
-              b'\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93\x94)\x81\x94'
-              b'C\x0ca\xf3a\xae\x95G\xb8\xb7\xd1\r\xc04\x94b\x8c\x05parse\x94'
-              b'\x8c\x0fself.detail_get\x94\x8c\x04item\x94}\x94(\x8c\x05ti'
-              b'tle\x94\x8c]\xe4\xb9\xa1\xe9\x95\x87\xe6\x95\xac\xe8'
-              b'\x80\x81\xe9\x99\xa2\xe5\xba\x8a\xe4\xbd\x8d\xe4'
-              b'\xbd\xbf\xe7\x94\xa8\xe7\x8e\x87\xe8\xbe\xbe\xe6'
-              b'\xa0\x87\xe5\x8e\xbf\xef\xbc\x88\xe5\xb8\x82\xe3'
-              b'\x80\x81\xe5\x8c\xba\xef\xbc\x89\xe7\xac\xac\xe4'
-              b'\xb8\x89\xe6\x96\xb9\xe8\xaf\x84\xe4\xbc\xb0\xe9'
-              b'\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae\xe9'
-              b'\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublis'
-              b'htime\x94\x8c\x132019-07-17 16:14:02\x94\x8c\nspidercode'
-              b'\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site\x94\x8c\x12\xe7'
-              b'\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6'
-              b'\x94\xbf\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80'
-              b'\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04are'
-              b'a\x94\x8c\x06\xe7\xa6\x8f\xe5\xbb\xba\x94\x8c\x04cit'
-              b'y\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c'
-              b'fhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672'
-              b'964c633ce/7c36067afe5b449ea66bae09d11cf45c/\x94\x8c\x0bpublis'
-              b'hdept\x94h\x18\x8c\tiscompete\x94\x88\x8c\x04type\x94'
-              b'h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-              b'me\x94h\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c'
-              b'\x05false\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bconte'
-              b'nthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94Nu'
-              b'\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94\x8c'
-              b'\x132022-01-28 11:23:26\x94\x8c\x0bdeal_detail\x94]'
-              b'\x94(\x8c\x17//div[@class="xl_main"]\x94\x8c\x19//div[@class="'
-              b'big-box-B"]\x94e\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94'
-              b'\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a63367'
-              b'2964c633ce/7c36067afe5b449ea66bae09d11cf45c/\x94\x8c\x0ereque'
-              b'st_params\x94}\x94\x8c\x06failed\x94K\x04\x8c\x06author\x94'
-              b'\x8c\x07details\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94'
-              b'N\x8c\x03pri\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94'
-              b'}\x94(\x8c\nlist_xpath\x94\x8ce//div[@id="fjxz"]/p[@class="ma'
-              b'r-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Ed'
-              b'itor"]//p/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname'
-              b'_xpath\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]'
-              b'\x94(\x8c\x03zip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94e\x8c\tfile'
-              b'_type\x94\x8c\x04doxc\x94\x8c\x07url_key\x94\x8c\x04htt'
-              b'p\x94\x8c\x04host\x94\x8cehttp://cz.fjzfcg.gov.cn/3500/notice/1'
-              b'c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11c'
-              b'f45c\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95;\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x17//di'
-                b'v[@class="xl_main"]\x94\x8c\x19//div[@class="big-box-B"]\x94'
-                b'e.',
- 'error_msg': 'requests.exceptions.ConnectionError: '
-              "HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max "
-              'retries exceeded with url: '
-              '/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/ '
-              '(Caused by '
-              "NewConnectionError('<urllib3.connection.HTTPConnection object "
-              'at 0x0000016835E6B850>: Failed to establish a new connection: '
-              "[Errno 11001] getaddrinfo failed'))",
- 'files': {'file_type': 'doxc',
-           'files_type': ['zip', 'doxc', 'ftp'],
-           'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c',
-           'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                         'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95$\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8c]\xe4\xb9\xa1\xe9\x95\x87\xe6\x95\xac\xe8\x80\x81'
-         b'\xe9\x99\xa2\xe5\xba\x8a\xe4\xbd\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe7'
-         b'\x8e\x87\xe8\xbe\xbe\xe6\xa0\x87\xe5\x8e\xbf\xef\xbc\x88\xe5\xb8'
-         b'\x82\xe3\x80\x81\xe5\x8c\xba\xef\xbc\x89\xe7\xac\xac\xe4\xb8\x89'
-         b'\xe6\x96\xb9\xe8\xaf\x84\xe4\xbc\xb0\xe9\x87\x87\xe8\xb4\xad\xe9'
-         b'\xa1\xb9\xe7\x9b\xae\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91'
-         b'\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-07-17 16:14:02\x94\x8c\n'
-         b'spidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site\x94\x8c'
-         b'\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6\x94\xbf'
-         b'\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80\x9a\xe7'
-         b'\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe7\xa6'
-         b'\x8f\xe5\xbb\xba\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref'
-         b'\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/1c4f9'
-         b'44709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c'
-         b'/\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04typ'
-         b'e\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime'
-         b'\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false'
-         b'\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml'
-         b'\x94h\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/'}
-                            
-Thread-5|2022-01-28 17:06:48,294|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/
-                method = GET
-                body = {'files': {'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp'], 'file_type': 'doxc', 'url_key': 'http', 'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,333|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,334|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max retries exceeded with url: /3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000259309ECA30>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x955\x05\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03_id\x94'
-              b'\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93\x94)\x81\x94'
-              b'C\x0ca\xf3a\xae\x95G\xb8\xb7\xd1\r\xc0B\x94b\x8c\x05parse\x94'
-              b'\x8c\x0fself.detail_get\x94\x8c\x04item\x94}\x94(\x8c\x05ti'
-              b'tle\x94\x8c?\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe5'
-              b'\x85\xbb\xe8\x80\x81\xe6\x9c\x8d\xe5\x8a\xa1\xe7'
-              b'\xbb\xbc\xe5\x90\x88\xe4\xbf\xa1\xe6\x81\xaf\xe5'
-              b'\xb9\xb3\xe5\x8f\xb0\xe9\x87\x87\xe8\xb4\xad\xe9'
-              b'\xa1\xb9\xe7\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5'
-              b'\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-0'
-              b'5-22 16:01:08\x94\x8c\nspidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94'
-              b'\x8c\x04site\x94\x8c\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7'
-              b'\x9c\x81\xe6\xb0\x91\xe6\x94\xbf\xe5\x8e\x85\x94\x8c\x07channe'
-              b'l\x94\x8c\x0c\xe9\x80\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a'
-              b'\x94\x8c\x04area\x94\x8c\x06\xe7\xa6\x8f\xe5\xbb\xba'
-              b'\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref'
-              b'\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/notice/d'
-              b'2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da'
-              b'31a8/\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompet'
-              b'e\x94\x88\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07biddin'
-              b'g\x94\x8c\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94'
-              b'h\x18\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\n'
-              b'comeintime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94'
-              b'h\x18\x8c\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07de'
-              b'tails\x94\x8c\x04date\x94\x8c\x132022-01-28 11:23:26\x94\x8c'
-              b'\x0bdeal_detail\x94]\x94(\x8c\x17//div[@class="xl_main"'
-              b']\x94\x8c\x19//div[@class="big-box-B"]\x94e\x8c\x0bcreate_time'
-              b'\x94N\x8c\tparse_url\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/not'
-              b'ice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877'
-              b'162da31a8/\x94\x8c\x0erequest_params\x94}\x94\x8c\x06fail'
-              b'ed\x94K\x04\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex'
-              b'_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07pro'
-              b'xies\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8ce'
-              b'//div[@id="fjxz"]/p[@class="mar-L30 fjwz"]/a|//div[@id="resu'
-              b'lt"]//u/a|//div[@class="TRS_Editor"]//p/a\x94\x8c\turl_xpat'
-              b'h\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./tex'
-              b't()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04doxc'
-              b'\x94\x8c\x03ftp\x94e\x8c\tfile_type\x94\x8c\x04doxc\x94\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94\x8c\x04host\x94\x8cehttp://cz.f'
-              b'jzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91'
-              b'255ff3752c4bc48770877162da31a8\x94u\x8c\x05error\x94N\x8c\x04c'
-              b'ode\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95;\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x17//di'
-                b'v[@class="xl_main"]\x94\x8c\x19//div[@class="big-box-B"]\x94'
-                b'e.',
- 'error_msg': 'requests.exceptions.ConnectionError: '
-              "HTTPConnectionPool(host='cz.fjzfcg.gov.cn', port=80): Max "
-              'retries exceeded with url: '
-              '/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/ '
-              '(Caused by '
-              "NewConnectionError('<urllib3.connection.HTTPConnection object "
-              'at 0x0000016835E877F0>: Failed to establish a new connection: '
-              "[Errno 11001] getaddrinfo failed'))",
- 'files': {'file_type': 'doxc',
-           'files_type': ['zip', 'doxc', 'ftp'],
-           'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8',
-           'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                         'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x06\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8c?\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe5\x85\xbb'
-         b'\xe8\x80\x81\xe6\x9c\x8d\xe5\x8a\xa1\xe7\xbb\xbc\xe5\x90\x88\xe4'
-         b'\xbf\xa1\xe6\x81\xaf\xe5\xb9\xb3\xe5\x8f\xb0\xe9\x87\x87\xe8\xb4'
-         b'\xad\xe9\xa1\xb9\xe7\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac'
-         b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132019-05-22 16:01'
-         b':08\x94\x8c\nspidercode\x94\x8c\x0efj_fjsmzt_tzgg\x94\x8c\x04site'
-         b'\x94\x8c\x12\xe7\xa6\x8f\xe5\xbb\xba\xe7\x9c\x81\xe6\xb0\x91\xe6'
-         b'\x94\xbf\xe5\x8e\x85\x94\x8c\x07channel\x94\x8c\x0c\xe9\x80'
-         b'\x9a\xe7\x9f\xa5\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06'
-         b'\xe7\xa6\x8f\xe5\xbb\xba\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bc'
-         b'ompetehref\x94N\x8c\x04href\x94\x8cfhttp://cz.fjzfcg.gov.cn/3500/no'
-         b'tice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31'
-         b'a8/\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04t'
-         b'ype\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-         b'me\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05fal'
-         b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h'
-         b'\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/'}
-                            
-Thread-5|2022-01-28 17:06:48,380|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,394|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,395|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x84\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1a\x81\xdbV\xa5\x9f\xf9hq\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cQ\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe8\xbd\xbb\xe5\xb7\xa5\xe8\x81\x8c\xe4\xb8\x9a'
-              b'\xe6\x8a\x80\xe6\x9c\xaf\xe5\xad\xa6\xe9\x99\xa2'
-              b'\xe6\x96\xb0\xe8\x83\xbd\xe6\xba\x90\xe6\xb1\xbd'
-              b'\xe8\xbd\xa6\xe6\xa3\x80\xe6\xb5\x8b\xe5\xae\x9e'
-              b'\xe8\xae\xad\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad'
-              b'\xe7\xbd\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 15:09'
-              b':43\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x0c\xe9\x87\x87'
-              b'\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area'
-              b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city'
-              b'\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`'
-              b'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8'
-              b'a7e3a9c7e946b44017e9f51af707454.html\x94\x8c\x0bpublishde'
-              b'pt\x94h\x18\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x18\x8c'
-              b'\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94'
-              b'h\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c\x05fa'
-              b'lse\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenth'
-              b'tml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94N'
-              b'u\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94'
-              b'\x8c\x132022-01-28 15:55:12\x94\x8c\x0bdeal_detail\x94'
-              b']\x94\x8c&//div[@class="info-article in active"]\x94a\x8c\x0bcr'
-              b'eate_time\x94N\x8c\tparse_url\x94\x8c`https://gdgpo.czt.gd.go'
-              b'v.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51'
-              b'af707454.html\x94\x8c\x0erequest_params\x94}\x94\x8c\x06faile'
-              b'd\x94K\x02\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex_'
-              b'js\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07prox'
-              b'ies\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8c-/'
-              b'/div[@class="info-article in active"]//div/a\x94\x8c\turl_xpath'
-              b'\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./text'
-              b'()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04doxc\x94'
-              b'\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c\x07url_key\x94\x8c\x04htt'
-              b'p\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1d\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cQ\xe5\xb9\xbf\xe4\xb8\x9c\xe8\xbd\xbb\xe5\xb7\xa5'
-         b'\xe8\x81\x8c\xe4\xb8\x9a\xe6\x8a\x80\xe6\x9c\xaf\xe5\xad\xa6\xe9'
-         b'\x99\xa2\xe6\x96\xb0\xe8\x83\xbd\xe6\xba\x90\xe6\xb1\xbd\xe8\xbd'
-         b'\xa6\xe6\xa3\x80\xe6\xb5\x8b\xe5\xae\x9e\xe8\xae\xad\xe8\xae\xbe'
-         b'\xe5\xa4\x87\xe8\xb4\xad\xe7\xbd\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5'
-         b'\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28'
-         b' 15:09:43\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_cggg'
-         b'\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6'
-         b'\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c'
-         b'\x07channel\x94\x8c\x0c\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac'
-         b'\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94'
-         b'\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94'
-         b'\x8c`https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7'
-         b'e3a9c7e946b44017e9f51af707454.html\x94\x8c\x0bpublishdept'
-         b'\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T'
-         b'\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomein'
-         b'time\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c'
-         b'\ncomeintime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail'
-         b'\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html'}
-                            
-Thread-5|2022-01-28 17:06:48,446|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,458|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,459|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x92\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1c\x81\xdbV\xa5\x9f\xf9hv\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cW\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe8\x8b\xb1\xe5\xbe\xb7\xe7\x9b\x91'
-              b'\xe7\x8b\xb1\xe8\x81\x8c\xe5\xb7\xa5\xe9\xa5\xad'
-              b'\xe5\xa0\x82\xe8\xbf\x90\xe8\x90\xa5\xe6\x9c\x8d'
-              b'\xe5\x8a\xa1\xe9\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae('
-              b'GZSW21201FG4176A)\xe7\xbb\x93\xe6\x9e\x9c\xe5\x85\xac\xe5\x91'
-              b'\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 14:51:5'
-              b'6\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcgwxwz_zbcjgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe4\xb8\xad'
-              b'\xe6\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8c`https://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.ht'
-              b'ml\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 15:55:14\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8c`https://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e'
-              b'9e62ca017e9f00529a7d80.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x02\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95+\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cW\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe8\x8b\xb1'
-         b'\xe5\xbe\xb7\xe7\x9b\x91\xe7\x8b\xb1\xe8\x81\x8c\xe5\xb7\xa5\xe9'
-         b'\xa5\xad\xe5\xa0\x82\xe8\xbf\x90\xe8\x90\xa5\xe6\x9c\x8d\xe5\x8a'
-         b'\xa1\xe9\x87\x87\xe8\xb4\xad\xe9\xa1\xb9\xe7\x9b\xae(GZSW21201FG417'
-         b'6A)\xe7\xbb\x93\xe6\x9e\x9c\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpu'
-         b'blishtime\x94\x8c\x132022-01-28 14:51:56\x94\x8c\nspidercode'
-         b'\x94\x8c\x15gd_gdszfcgwxwz_zbcjgg\x94\x8c\x04site\x94'
-         b'\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba'
-         b'\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel'
-         b'\x94\x8c\x12\xe4\xb8\xad\xe6\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5'
-         b'\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8'
-         b'\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c'
-         b'\x04href\x94\x8c`https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx'
-         b'/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html\x94\x8c\x0bpubli'
-         b'shdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x0e\x8c'
-         b'\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\n'
-         b'comeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d'
-         b'\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06deta'
-         b'il\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html'}
-                            
-Thread-5|2022-01-28 17:06:48,484|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,503|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,504|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x8f\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1c\x81\xdbV\xa5\x9f\xf9hw\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cT\xe4\xbd\x9b\xe5\xb1\xb1'
-              b'\xe5\xb8\x82\xe9\xa1\xba\xe5\xbe\xb7\xe5\x8c\xba'
-              b'\xe4\xba\xba\xe6\xb0\x91\xe6\xb3\x95\xe9\x99\xa2'
-              b'\xe4\xbf\xa1\xe6\x81\xaf\xe5\x8c\x96\xe8\xbd\xaf'
-              b'\xe7\xa1\xac\xe4\xbb\xb6\xe8\xae\xbe\xe5\xa4\x87'
-              b'\xe7\xbb\xb4\xe6\x8a\xa4\xe6\x9c\x8d\xe5\x8a\xa1'
-              b'\xe9\xa1\xb9\xe7\x9b\xae\xe7\xbb\x93\xe6\x9e\x9c'
-              b'\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132'
-              b'022-01-28 14:37:36\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcg'
-              b'wxwz_zbcjgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c'
-              b'\xe9\x87\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel\x94'
-              b'\x8c\x12\xe4\xb8\xad\xe6\xa0\x87\xe6\x88\x90\xe4'
-              b'\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94'
-              b'\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94'
-              b'\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https'
-              b'://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7'
-              b'dd7e9e4962017e9f56e40058a5.html\x94\x8c\x0bpublishdept\x94h\x18'
-              b'\x8c\tiscompete\x94\x88\x8c\x04type\x94h\x18\x8c\x01'
-              b'T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishtime\x94h'
-              b'\x18\x8c\ncomeintime\x94h\x18\x8c\x08sendflag\x94\x8c\x05fal'
-              b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontentht'
-              b'ml\x94h\x18\x8c\x06detail\x94h\x18\x8c\x0bprojectinfo\x94Nu'
-              b'\x8c\x0bparser_name\x94\x8c\x07details\x94\x8c\x04date\x94\x8c'
-              b'\x132022-01-28 15:55:14\x94\x8c\x0bdeal_detail\x94]\x94\x8c&/'
-              b'/div[@class="info-article in active"]\x94a\x8c\x0bcreate_time'
-              b'\x94N\x8c\tparse_url\x94\x8c`https://gdgpo.czt.gd.gov.cn/free'
-              b'cms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.'
-              b'html\x94\x8c\x0erequest_params\x94}\x94\x8c\x06failed'
-              b'\x94K\x02\x8c\x06author\x94\x8c\x07details\x94\x8c\x05ex_j'
-              b's\x94h\x18\x8c\tex_python\x94N\x8c\x03pri\x94K\x01\x8c\x07proxi'
-              b'es\x94\x89\x8c\x05files\x94}\x94(\x8c\nlist_xpath\x94\x8c-//'
-              b'div[@class="info-article in active"]//div/a\x94\x8c\turl_xp'
-              b'ath\x94\x8c\x07./@href\x94\x8c\nname_xpath\x94\x8c\x08./text('
-              b')\x94\x8c\nfiles_type\x94]\x94(\x8c\x03zip\x94\x8c\x04do'
-              b'xc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c\x07url_ke'
-              b'y\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code\x94K\x00'
-              b'u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95(\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cT\xe4\xbd\x9b\xe5\xb1\xb1\xe5\xb8\x82\xe9\xa1\xba'
-         b'\xe5\xbe\xb7\xe5\x8c\xba\xe4\xba\xba\xe6\xb0\x91\xe6\xb3\x95\xe9'
-         b'\x99\xa2\xe4\xbf\xa1\xe6\x81\xaf\xe5\x8c\x96\xe8\xbd\xaf\xe7\xa1'
-         b'\xac\xe4\xbb\xb6\xe8\xae\xbe\xe5\xa4\x87\xe7\xbb\xb4\xe6\x8a\xa4'
-         b'\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7\x9b\xae\xe7\xbb\x93\xe6'
-         b'\x9e\x9c\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c'
-         b'\x132022-01-28 14:37:36\x94\x8c\nspidercode\x94\x8c\x15gd_gdszfcgwx'
-         b'wz_zbcjgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8'
-         b'\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad'
-         b'\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe4\xb8\xad\xe6'
-         b'\xa0\x87\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c'
-         b'\x04area\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04cit'
-         b'y\x94\x8c\x00\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`htt'
-         b'ps://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e'
-         b'4962017e9f56e40058a5.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tisco'
-         b'mpete\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07biddin'
-         b'g\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94'
-         b'h\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncome'
-         b'intime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94'
-         b'h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html'}
-                            
-Thread-5|2022-01-28 17:06:48,552|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,564|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,565|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x8d\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf2\x95\xb9{\xdc<\xbf\xf2)V\xe6\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cZ\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe4\xba\xba\xe5\x8a\x9b\xe8\xb5\x84'
-              b'\xe6\xba\x90\xe5\xb8\x82\xe5\x9c\xba\xe8\xae\xbe'
-              b'\xe6\x96\xbd\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad'
-              b'\xe7\xbd\xae\xe9\x9b\x86\xe6\x88\x90\xe5\x8f\x8a'
-              b'\xe5\xb1\x95\xe9\x99\x88\xe5\xb8\x83\xe7\xbd\xae'
-              b'\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7\x9b\xae'
-              b'\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bp'
-              b'ublishtime\x94\x8c\x132022-01-27 19:14:45\x94\x8c\nspidercod'
-              b'e\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c\x04site\x94\x8c'
-              b'\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94'
-              b'\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7\xbd'
-              b'\x91\x94\x8c\x07channel\x94\x8c\x0c\xe9\x87\x87\xe8\xb4\xad'
-              b'\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94\x8c\x06'
-              b'\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00'
-              b'\x94\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https://gdg'
-              b'po.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e90'
-              b'32e5017e9b37b0c50534.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\t'
-              b'iscompete\x94\x88\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bi'
-              b'dding\x94\x8c\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94'
-              b'h\x18\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\n'
-              b'comeintime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94'
-              b'h\x18\x8c\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07de'
-              b'tails\x94\x8c\x04date\x94\x8c\x132022-01-27 20:53:12\x94\x8c'
-              b'\x0bdeal_detail\x94]\x94\x8c&//div[@class="info-article in a'
-              b'ctive"]\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8c`ht'
-              b'tps://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7'
-              b'efa517e9032e5017e9b37b0c50534.html\x94\x8c\x0erequest_par'
-              b'ams\x94}\x94\x8c\x06failed\x94K\x13\x8c\x06author\x94\x8c\x07'
-              b'details\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c'
-              b'\x03pri\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94'
-              b'(\x8c\nlist_xpath\x94\x8c-//div[@class="info-article in active'
-              b'"]//div/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_x'
-              b'path\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94('
-              b'\x8c\x03zip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf'
-              b'\x94e\x8c\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94'
-              b'N\x8c\x04code\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95&\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cZ\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe4\xba\xba'
-         b'\xe5\x8a\x9b\xe8\xb5\x84\xe6\xba\x90\xe5\xb8\x82\xe5\x9c\xba\xe8'
-         b'\xae\xbe\xe6\x96\xbd\xe8\xae\xbe\xe5\xa4\x87\xe8\xb4\xad\xe7\xbd'
-         b'\xae\xe9\x9b\x86\xe6\x88\x90\xe5\x8f\x8a\xe5\xb1\x95\xe9\x99\x88'
-         b'\xe5\xb8\x83\xe7\xbd\xae\xe6\x9c\x8d\xe5\x8a\xa1\xe9\xa1\xb9\xe7'
-         b'\x9b\xae\xe6\x8b\x9b\xe6\xa0\x87\xe5\x85\xac\xe5\x91\x8a\x94\x8c'
-         b'\x0bpublishtime\x94\x8c\x132022-01-27 19:14:45\x94\x8c\nspiderc'
-         b'ode\x94\x8c\x13gd_gdszfcgwxwz_cggg\x94\x8c\x04site\x94\x8c\x18\xe5'
-         b'\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87'
-         b'\x87\xe8\xb4\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x0c'
-         b'\xe9\x87\x87\xe8\xb4\xad\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x04area\x94'
-         b'\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c'
-         b'\x0bcompetehref\x94N\x8c\x04href\x94\x8c`https://gdgpo.czt.gd.gov.'
-         b'cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.h'
-         b'tml\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete\x94\x88\x8c\x04t'
-         b'ype\x94h\x0e\x8c\x01T\x94\x8c\x07bidding\x94\x8c\x10l_np_publishti'
-         b'me\x94h\x0e\x8c\ncomeintime\x94h\x0e\x8c\x08sendflag\x94\x8c\x05fal'
-         b'se\x94\x8c\x02_d\x94\x8c\ncomeintime\x94\x8c\x0bcontenthtml\x94h'
-         b'\x0e\x8c\x06detail\x94h\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html'}
-                            
-Thread-5|2022-01-28 17:06:48,615|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,628|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,628|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x86\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3\xa1\\\x81\xdbV\xa5\x9f\xf9hU\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cE\xe9\x9f\xb6\xe5\x85\xb3'
-              b'\xe5\xb8\x82\xe6\xad\xa6\xe6\xb1\x9f\xe5\x8c\xba'
-              b'\xe4\xba\xba\xe6\xb0\x91\xe6\xa3\x80\xe5\xaf\x9f'
-              b'\xe9\x99\xa2\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96'
-              b'\xe5\x9c\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2'
-              b'\xe8\xb4\xad\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 15:48'
-              b':52\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_ysgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5'
-              b'\xe5\xad\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1'
-              b'\xe6\x81\xaf\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b1'
-              b'0.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 15:55:07\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8cdhttps://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1'
-              b'545-438c-8ad6-ccdd8bd71b10.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x02\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1b\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cE\xe9\x9f\xb6\xe5\x85\xb3\xe5\xb8\x82\xe6\xad\xa6'
-         b'\xe6\xb1\x9f\xe5\x8c\xba\xe4\xba\xba\xe6\xb0\x91\xe6\xa3\x80\xe5'
-         b'\xaf\x9f\xe9\x99\xa2\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96\xe5\x9c'
-         b'\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2\xe8\xb4\xad\xe6\x88\x90'
-         b'\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94'
-         b'\x8c\x132022-01-28 15:48:52\x94\x8c\nspidercode\x94\x8c\x13gd_gdsz'
-         b'fcgwxwz_ysgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c'
-         b'\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7'
-         b'\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5\xe5\xad'
-         b'\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1\xe6\x81\xaf\x94\x8c\x04area'
-         b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94'
-         b'\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd'
-         b'.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8'
-         b'bd71b10.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete'
-         b'\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07bidding'
-         b'\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94h'
-         b'\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomei'
-         b'ntime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94h'
-         b'\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html'}
-                            
-Thread-5|2022-01-28 17:06:48,677|request.py|get_response|line:305|DEBUG| 
-                -------------- Details.detail_get request for ----------------
-                url  = https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html
-                method = GET
-                body = {'files': {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href', 'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf'], 'url_key': 'http'}, 'proxies': False, 'timeout': 22, 'stream': True, 'verify': False, 'headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36'}}
-                
-Thread-5|2022-01-28 17:06:48,691|tools.py|dumps_json|line:843|ERROR| Object of type bytes is not JSON serializable
-Thread-5|2022-01-28 17:06:48,691|parser_control.py|deal_requests|line:249|ERROR| 
-                            -------------- Details.detail_get error -------------
-                            error          dictionary update sequence element #0 has length 1; 2 is required
-                            response       None
-                            deal request   {'base_info': b'\x80\x04\x95\x86\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x03'
-              b'_id\x94\x8c\rbson.objectid\x94\x8c\x08ObjectId\x94\x93'
-              b'\x94)\x81\x94C\x0ca\xf3Z\x94J\xa3\xe2Z\x12\xe9\t\x00\x94b'
-              b'\x8c\x05parse\x94\x8c\x0fself.detail_get\x94\x8c\x04item'
-              b'\x94}\x94(\x8c\x05title\x94\x8cE\xe5\xb9\xbf\xe4\xb8\x9c'
-              b'\xe7\x9c\x81\xe4\xbd\x9b\xe5\xb1\xb1\xe8\x88\xaa'
-              b'\xe9\x81\x93\xe4\xba\x8b\xe5\x8a\xa1\xe4\xb8\xad'
-              b'\xe5\xbf\x83\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96'
-              b'\xe5\x9c\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2'
-              b'\xe8\xb4\xad\xe6\x88\x90\xe4\xba\xa4\xe5\x85\xac'
-              b'\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94\x8c\x132022-01-28 10:00'
-              b':38\x94\x8c\nspidercode\x94\x8c\x13gd_gdszfcgwxwz_ysgg\x94\x8c'
-              b'\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c'
-              b'\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4'
-              b'\xad\xe7\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5'
-              b'\xe5\xad\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1'
-              b'\xe6\x81\xaf\x94\x8c\x04area\x94\x8c\x06\xe5\xb9\xbf'
-              b'\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94\x8c\x0bcompeteh'
-              b'ref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd.gov.cn/freecm'
-              b's/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c91'
-              b'3.html\x94\x8c\x0bpublishdept\x94h\x18\x8c\tiscompete\x94\x88'
-              b'\x8c\x04type\x94h\x18\x8c\x01T\x94\x8c\x07bidding\x94\x8c'
-              b'\x10l_np_publishtime\x94h\x18\x8c\ncomeintime\x94h\x18\x8c'
-              b'\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomeint'
-              b'ime\x94\x8c\x0bcontenthtml\x94h\x18\x8c\x06detail\x94h\x18\x8c'
-              b'\x0bprojectinfo\x94Nu\x8c\x0bparser_name\x94\x8c\x07detai'
-              b'ls\x94\x8c\x04date\x94\x8c\x132022-01-28 10:53:07\x94\x8c\x0bde'
-              b'al_detail\x94]\x94\x8c&//div[@class="info-article in active"]'
-              b'\x94a\x8c\x0bcreate_time\x94N\x8c\tparse_url\x94\x8cdhttps://g'
-              b'dgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-1'
-              b'02a-4923-b4cb-dadfdc82c913.html\x94\x8c\x0erequest_params'
-              b'\x94}\x94\x8c\x06failed\x94K\x0b\x8c\x06author\x94\x8c\x07det'
-              b'ails\x94\x8c\x05ex_js\x94h\x18\x8c\tex_python\x94N\x8c\x03pr'
-              b'i\x94K\x01\x8c\x07proxies\x94\x89\x8c\x05files\x94}\x94(\x8c\n'
-              b'list_xpath\x94\x8c-//div[@class="info-article in active"]//div'
-              b'/a\x94\x8c\turl_xpath\x94\x8c\x07./@href\x94\x8c\nname_xpat'
-              b'h\x94\x8c\x08./text()\x94\x8c\nfiles_type\x94]\x94(\x8c\x03z'
-              b'ip\x94\x8c\x04doxc\x94\x8c\x03ftp\x94\x8c\x03pdf\x94e\x8c'
-              b'\x07url_key\x94\x8c\x04http\x94u\x8c\x05error\x94N\x8c\x04code'
-              b'\x94K\x00u.',
- 'callback': 'detail_get',
- 'deal_detail': b'\x80\x04\x95-\x00\x00\x00\x00\x00\x00\x00]\x94\x8c&//div[@cl'
-                b'ass="info-article in active"]\x94a.',
- 'error_msg': 'ValueError: dictionary update sequence element #0 has length 1; '
-              '2 is required',
- 'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-           'list_xpath': '//div[@class="info-article in active"]//div/a',
-           'name_xpath': './text()',
-           'url_key': 'http',
-           'url_xpath': './@href'},
- 'filter_repeat': False,
- 'item': b'\x80\x04\x95\x1b\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05titl'
-         b'e\x94\x8cE\xe5\xb9\xbf\xe4\xb8\x9c\xe7\x9c\x81\xe4\xbd\x9b'
-         b'\xe5\xb1\xb1\xe8\x88\xaa\xe9\x81\x93\xe4\xba\x8b\xe5\x8a\xa1\xe4'
-         b'\xb8\xad\xe5\xbf\x83\xe7\x94\xb5\xe5\xad\x90\xe5\x8d\x96\xe5\x9c'
-         b'\xba\xe7\x9b\xb4\xe6\x8e\xa5\xe8\xae\xa2\xe8\xb4\xad\xe6\x88\x90'
-         b'\xe4\xba\xa4\xe5\x85\xac\xe5\x91\x8a\x94\x8c\x0bpublishtime\x94'
-         b'\x8c\x132022-01-28 10:00:38\x94\x8c\nspidercode\x94\x8c\x13gd_gdsz'
-         b'fcgwxwz_ysgg\x94\x8c\x04site\x94\x8c\x18\xe5\xb9\xbf\xe4\xb8\x9c'
-         b'\xe7\x9c\x81\xe6\x94\xbf\xe5\xba\x9c\xe9\x87\x87\xe8\xb4\xad\xe7'
-         b'\xbd\x91\x94\x8c\x07channel\x94\x8c\x12\xe7\x94\xb5\xe5\xad'
-         b'\x90\xe5\x8d\x96\xe5\x9c\xba\xe4\xbf\xa1\xe6\x81\xaf\x94\x8c\x04area'
-         b'\x94\x8c\x06\xe5\xb9\xbf\xe4\xb8\x9c\x94\x8c\x04city\x94\x8c\x00\x94'
-         b'\x8c\x0bcompetehref\x94N\x8c\x04href\x94\x8cdhttps://gdgpo.czt.gd'
-         b'.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfd'
-         b'c82c913.html\x94\x8c\x0bpublishdept\x94h\x0e\x8c\tiscompete'
-         b'\x94\x88\x8c\x04type\x94h\x0e\x8c\x01T\x94\x8c\x07bidding'
-         b'\x94\x8c\x10l_np_publishtime\x94h\x0e\x8c\ncomeintime\x94h'
-         b'\x0e\x8c\x08sendflag\x94\x8c\x05false\x94\x8c\x02_d\x94\x8c\ncomei'
-         b'ntime\x94\x8c\x0bcontenthtml\x94h\x0e\x8c\x06detail\x94h'
-         b'\x0e\x8c\x0bprojectinfo\x94Nu.',
- 'parser_name': 'Details',
- 'proxies': False,
- 'response': 'None',
- 'retry_times': 2,
- 'url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html'}
-                            
-Thread-5|2022-01-28 17:06:48,741|parser_control.py|run|line:56|DEBUG| parser 等待任务...
-Thread-3|2022-01-28 17:06:49,216|tools.py|dumps_json|line:843|ERROR| Object of type ObjectId is not JSON serializable
-Thread-3|2022-01-28 17:06:49,222|item_buffer.py|__add_item_to_db|line:300|DEBUG| 
-                -------------- item 批量入库 --------------
-                表名: mgp_list
-                datas: [{'_id': ObjectId('61f361ae9547b8b7d10dc034'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 11:23:26',
-  'deal_detail': ['//div[@class="xl_main"]', '//div[@class="big-box-B"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 5,
-  'files': {'file_type': 'doxc',
-            'files_type': ['zip', 'doxc', 'ftp'],
-            'host': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c',
-            'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                          'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '福建',
-           'channel': '通知公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2019-07-17 16:14:02',
-           'sendflag': 'false',
-           'site': '福建省民政厅',
-           'spidercode': 'fj_fjsmzt_tzgg',
-           'title': '乡镇敬老院床位使用率达标县(市、区)第三方评估采购项目采购公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'http://cz.fjzfcg.gov.cn/3500/notice/1c4f944709d047a7a633672964c633ce/7c36067afe5b449ea66bae09d11cf45c/',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f361ae9547b8b7d10dc042'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 11:23:26',
-  'deal_detail': ['//div[@class="xl_main"]', '//div[@class="big-box-B"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 5,
-  'files': {'file_type': 'doxc',
-            'files_type': ['zip', 'doxc', 'ftp'],
-            'host': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8',
-            'list_xpath': '//div[@id="fjxz"]/p[@class="mar-L30 '
-                          'fjwz"]/a|//div[@id="result"]//u/a|//div[@class="TRS_Editor"]//p/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '福建',
-           'channel': '通知公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2019-05-22 16:01:08',
-           'sendflag': 'false',
-           'site': '福建省民政厅',
-           'spidercode': 'fj_fjsmzt_tzgg',
-           'title': '福建省养老服务综合信息平台采购项目招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'http://cz.fjzfcg.gov.cn/3500/notice/d2bad35854053876b45269f56e50dee2/91255ff3752c4bc48770877162da31a8/',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16181db56a59ff96871'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:12',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '采购公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 15:09:43',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_cggg',
-           'title': '广东轻工职业技术学院新能源汽车检测实训设备购置招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7e3a9c7e946b44017e9f51af707454.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16381db56a59ff96876'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:14',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '中标成交公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 14:51:56',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_zbcjgg',
-           'title': '广东省英德监狱职工饭堂运营服务采购项目(GZSW21201FG4176A)结果公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7edc7d7e9e62ca017e9f00529a7d80.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a16381db56a59ff96877'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:14',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '中标成交公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 14:37:36',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_zbcjgg',
-           'title': '佛山市顺德区人民法院信息化软硬件设备维护服务项目结果公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7ee7dd7e9e4962017e9f56e40058a5.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f295b97bdc3cbff22956e6'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-27 20:53:12',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 20,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '采购公告',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-27 19:14:45',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_cggg',
-           'title': '广东省人力资源市场设施设备购置集成及展陈布置服务项目招标公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/8a7efa517e9032e5017e9b37b0c50534.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f3a15c81db56a59ff96855'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 15:55:07',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 3,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '电子卖场信息',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 15:48:52',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_ysgg',
-           'title': '韶关市武江区人民检察院电子卖场直接订购成交公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/9b5da8cc-1545-438c-8ad6-ccdd8bd71b10.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}},
- {'_id': ObjectId('61f35a944aa3e25a12e90900'),
-  'author': None,
-  'code': 0,
-  'create_time': None,
-  'date': '2022-01-28 10:53:07',
-  'deal_detail': ['//div[@class="info-article in active"]'],
-  'error': None,
-  'ex_js': '',
-  'ex_python': None,
-  'failed': 12,
-  'files': {'files_type': ['zip', 'doxc', 'ftp', 'pdf'],
-            'list_xpath': '//div[@class="info-article in active"]//div/a',
-            'name_xpath': './text()',
-            'url_key': 'http',
-            'url_xpath': './@href'},
-  'item': {'T': 'bidding',
-           '_d': 'comeintime',
-           'area': '广东',
-           'channel': '电子卖场信息',
-           'city': '',
-           'comeintime': '',
-           'competehref': None,
-           'contenthtml': '',
-           'detail': '',
-           'href': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html',
-           'iscompete': True,
-           'l_np_publishtime': '',
-           'projectinfo': None,
-           'publishdept': '',
-           'publishtime': '2022-01-28 10:00:38',
-           'sendflag': 'false',
-           'site': '广东省政府采购网',
-           'spidercode': 'gd_gdszfcgwxwz_ysgg',
-           'title': '广东省佛山航道事务中心电子卖场直接订购成交公告',
-           'type': ''},
-  'parse': 'self.detail_get',
-  'parse_url': 'https://gdgpo.czt.gd.gov.cn/freecms/site/gd/ggxx/info/2022/ab73c655-102a-4923-b4cb-dadfdc82c913.html',
-  'parser_name': 'details',
-  'pri': 1,
-  'proxies': False,
-  'request_params': {}}]
-                    
-Thread-3|2022-01-28 17:06:49,723|mongo_pipeline.py|save_items|line:49|INFO| 共导出 8 条数据到 mgp_list,  新增 8条, 重复 0 条
-Details|2022-01-28 17:06:53,273|scheduler.py|<lambda>|line:117|INFO| 
-********** feapder end **********
-Details|2022-01-28 17:06:53,275|scheduler.py|spider_end|line:520|INFO| 《magp:details1》爬虫结束,耗时 3分20秒
-Details|2022-01-28 17:06:53,276|scheduler.py|delete_tables|line:444|INFO| 正在删除key magp:details1:z_spider_status
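
The deleted log above ends with the same two failures repeated for every detail request: tools.py (dumps_json) cannot JSON-serialize the pickled request fields ("Object of type bytes is not JSON serializable"), and Details.detail_get aborts with "ValueError: dictionary update sequence element #0 has length 1; 2 is required" while the response is still None. That ValueError is the usual symptom of handing a plain string to something that expects a mapping, for example dict(...) or dict.update(...), so it most likely fires while the request keyword arguments are being assembled rather than during parsing. A minimal, self-contained repro of the message (hypothetical names, not the project's code):

    # Reproduces the ValueError seen in the log above.
    # Illustrative only: it demonstrates the failure mode, not the spider's code path.
    def build_kwargs(request_params):
        # dict() expects an iterable of (key, value) pairs; a plain string is
        # iterated character by character, and each 1-character "pair" fails.
        return dict(request_params)

    print(build_kwargs({"timeout": 22}))   # -> {'timeout': 22}
    try:
        build_kwargs("timeout=22")
    except ValueError as exc:
        print(exc)  # dictionary update sequence element #0 has length 1; 2 is required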

+ 0 - 170
NoteWork/details/details.py

@@ -1,170 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import json
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
-            for item in data_lsit:
-                print(11111)
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        if request.files:
-            files_info = request.files
-            files = response.xpath(files_info.get("list_xpath"))
-            if len(files) > 0:
-                attachments = {}
-                for index, info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name, file_type=file_type, download_url=file_url,
-                            enable_proxy=False)
-                        attachments[len(attachments) + 1] = attachment
-                if attachments:
-                    list_item.projectinfo = {"attachment": attachments}
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
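
The attachment handling in Details.detail_get above is driven entirely by the "files" dict carried with each mongo task (list_xpath, url_xpath, name_xpath, files_type, url_key, plus optional host and file_type). A condensed sketch of those selection rules, kept separate from the download step; pick_attachments is a hypothetical helper and the elements are assumed to be lxml-style nodes:

    # Sketch of the attachment-selection rules used by detail_get above.
    # pick_attachments is illustrative; it only filters links and does not download.
    from urllib.parse import urljoin

    def pick_attachments(elements, files_info):
        """Return {file_name: file_url} for links that match the configured rules."""
        picked = {}
        for el in elements:
            urls = el.xpath(files_info["url_xpath"])
            names = el.xpath(files_info["name_xpath"])
            if not urls or not names:
                continue
            file_url, file_name = urls[0], names[0]
            if files_info.get("host"):            # make relative links absolute
                file_url = urljoin(files_info["host"], file_url)
            # an explicit file_type wins, otherwise infer it from the URL path
            file_type = files_info.get("file_type") or file_url.split("?")[0].split(".")[-1].lower()
            if file_type in files_info["files_type"] and files_info["url_key"] in file_url:
                picked[file_name] = file_url
        return picked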

+ 0 - 180
NoteWork/details/details_ces.py

@@ -1,180 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import time
-from urllib.parse import urljoin
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-from untils.attachment import AttachmentDownloader
-
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details","item.spidercode":"a_szsjzsczhcxpt_zbxx"},sort={"item.publishtime":-1},limit=1)
-            for item in data_lsit:
-                print(item.get("item"))
-                request_params = item.get("request_params")
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")),base_info=item)
-                else:
-                    # print(item.get("files"))
-                    files = {'list_xpath': '//div[@class="info-article in active"]//div/a', 'url_xpath': './@href',
-                     'name_xpath': './text()', 'files_type': ['zip', 'doxc', 'ftp', 'pdf',"ddf"], 'url_key': 'http'}
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=files,
-                                          deal_detail=item.get("deal_detail"),
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False)
-
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        if request.files_info:
-            files_info = request.files_info
-            files =  response.xpath(files_info.get("list_xpath"))
-            if len(files)>1:
-                attachments = {}
-                for index,info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if not files_info.get("file_type"):
-                        file_type = file_url.split("?")[0].split(".")[-1].lower()
-                    else:
-                        file_type = files_info.get("file_type")
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[index] = attachment
-                list_item.projectinfo=attachments
-            else:
-                for info in files:
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if files_info.get("host"):
-                        file_url = urljoin(files_info.get("host"), file_url)
-                    if files_info.get("file_name"):
-                        file_name = files_info.get("file_name")
-                    if files_info.get("file_type"):
-                        file_type = files_info.get("file_type")
-                    else:
-                        file_type = file_name.split("?")[0].split(".")[-1]
-                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name, file_type=file_type, download_url=file_url,
-                            enable_proxy=False)
-                        list_item.projectinfo = attachment
-
-        yield list_item
-
-    def detail_json(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-    def detail_post(self,request,response):
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
-        if 200<=code<300:
-            err = 'analysis'
-        elif 300<=code<400:
-            err = 'download'
-        elif 400<=code<500:
-            err = 'download'
-        elif 500<=code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code=code
-        mgp.error=err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()
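
details_ces.py is a test variant of the spider above (it pins a single spidercode, hard-codes the files config and treats the single-attachment case separately), but both share the same failed_request escalation: map the status code to an error class, bump the failed counter, and for tasks with pri > 5 send a WeChat alert only at the 10/30/50/100/200 failure marks, throttled per site through send_list. A simplified sketch of that rule (classify_error and should_alert are illustrative names):

    # Sketch of the failed_request escalation shared by these Details spiders.
    ALERT_MARKS = (10, 30, 50, 100, 200)

    def classify_error(code):
        if code is None or code == 0:
            return "timeout"
        if 200 <= code < 300:
            return "analysis"        # page downloaded but parsing failed
        if 300 <= code < 500:
            return "download"
        return "servers"

    def should_alert(failed, pri, site, send_list):
        """Alert high-priority tasks at fixed failure counts, throttled per site."""
        if not pri or pri <= 5:
            return False
        if failed not in ALERT_MARKS and failed <= 200:
            return False
        # mirrors the spiders' check: send_list.count(site) == pri - 5
        return send_list.count(site) == pri - 5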

+ 0 - 165
NoteWork/details/details_cookie.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  detail-page handler that generates cookies with a limited lifetime and reuses them from a pool; IP is not restricted by default
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # define the MongoDB connection
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''handle HTML-formatted responses'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''failure handling: when the text marker is set and appears in response.text, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when the response code is not an expected status code, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # detail content of the bid announcement
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''handle JSON and other non-HTML response formats'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''failure handling: when the text marker is set and appears in response.text, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        if response.code in (request.down_mid.get("code")):
-            '''failure handling: when the response code is not an expected status code, delete the current cookie and generate a new one'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-            return
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''After the request/parse failure count exceeds the limit, save the original task back to mongo and update the failed field'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''send alerts according to spider priority'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 115
NoteWork/details/details_firefox.py

@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-
-
-class FirefoxDetails(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
-            print(data_lsit)
-            for item in data_lsit:
-                print(item)
-                request_params = item.get("request_params")
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,render=True,
-                                      render_time=item.get("render_time"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-    def detail_get(self,request,response):
-        print(response.text)
-        items = request.item
-        # print(items)
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        if response is None:
-            code = 0
-        else:
-            code = response.status_code
-        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
-        if 200 <= code < 300:
-            err = 'analysis'
-        elif 300 <= code < 400:
-            err = 'download'
-        elif 400 <= code < 500:
-            err = 'download'
-        elif 500 <= code:
-            err = "servers"
-        else:
-            err = "timeout"
-        mgp = MgpListItem()
-        mgp.code = code
-        mgp.error = err
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key, items[key])
-        mgp.failed += 1
-        if mgp.pri is None:
-            mgp.pri = 0
-
-        if mgp.pri > 5:
-            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info = f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **爬虫等级:** {mgp.pri}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-    # def download_midware(self, request):
-    #     request.proxies = self.prox_pool.get()
-    #     return request
-
-
-if __name__ == "__main__":
-    FirefoxDetails(redis_key="magp:details:firefox").start()
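
failed_request above buckets the final status code into the error label stored on MgpListItem.error before the task is written back to mgp_list. The same mapping as a standalone helper, with the buckets taken directly from the deleted code (the function name is illustrative):

    def classify_failure(status_code):
        """Map the last HTTP status code onto an error family: analysis / download / servers / timeout."""
        if not status_code:                      # no response at all
            return "timeout"
        if 200 <= status_code < 300:
            return "analysis"                    # page downloaded, parsing failed
        if 300 <= status_code < 500:
            return "download"                    # redirects and client errors
        if status_code >= 500:
            return "servers"
        return "timeout"

    assert classify_failure(0) == "timeout"
    assert classify_failure(404) == "download"
    assert classify_failure(503) == "servers"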

+ 0 - 150
NoteWork/details/details_login.py

@@ -1,150 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 13:25:15
----------
-@summary:  生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
----------
-@author: 马国鹏
-"""
-
-import feapder
-from feapder.utils.tools import wechat_warning
-import execjs
-from items.spider_item import DataBakItem, MgpListItem
-from feapder.db.mongodb import MongoDB
-
-from untils.cookie_pool import LoginCookiePool, PageCookiePool
-import copy
-
-class Details(feapder.Spider):
-    _to_db = None
-    db_name = 'mgp_list'
-    send_list = []
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    def start_requests(self):
-        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
-            for item in data_lsit:
-                request_params = item.get("request_params")
-                down_mid = copy.copy(item.get("down_mid"))
-                key = down_mid.get("key")
-                page_url = down_mid.get("page_url")
-                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-                down_mid["cookie_pool"] = cookie_pool
-                print(down_mid)
-
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
-                self.to_db.delete(self.db_name,item)
-            break
-
-
-
-    def detail_get(self,request,response):
-        '''处理html格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在response.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response.code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def detail_json(self,request,response):
-        '''处理json串及其他格式的返回结果'''
-        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
-            '''失败处理,当text设置不为None,且在response.text中时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        if response.code in (request.down_mid.get("code")):
-            '''失败处理,response.code不为正确的状态码时,删除当前cookie并重新生产cookie'''
-            down_mid = copy.copy(request.down_mid)
-            key = down_mid.get("key")
-            page_url = down_mid.get("page_url")
-            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-            cookie_pool.del_cookie(request.cookies)
-            yield request
-        items = request.item
-        list_item = DataBakItem()
-        for key in items:
-            list_item.__setitem__(key,items[key])
-        html = ''
-        exec(request.deal_detail)
-
-        list_item.contenthtml = html
-        yield list_item
-
-    def failed_request(self, request, response):
-        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
-        mgp = MgpListItem()
-        items = request.base_info
-        for key in items:
-            mgp.__setitem__(key,items[key])
-        mgp.failed +=1
-        print(f'......{mgp.failed}')
-        if mgp.pri > 5:
-            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
-                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
-                    '''
-                    根据爬虫优先级报警'''
-                    info= f'''`
-        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
-        > **爬虫名称:** {mgp.item.get("site")}
-        > **栏目名称:** {mgp.item.get("channel")}
-        > **爬虫代码:** {mgp.item.get("spidercode")}
-        > **所属管理人员:** {mgp.author}
-        请登录剑鱼爬虫管理平台查看详情。
-        `'''
-                    wechat_warning(info)
-                    self.send_list.append(mgp.item.get("site"))
-        yield mgp
-
-
-    def end_callback(self):
-        print("爬虫结束")
-    def download_midware(self, request):
-        down_mid = request.down_mid
-        key = down_mid.get("key")
-        page_url = down_mid.get("page_url")
-        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
-        request.cookies = cookie_pool.get_cookie()
-        return request
-
-
-if __name__ == "__main__":
-    Details(redis_key="magp:details1").start()

+ 0 - 88
NoteWork/details/dtcookie_pool.py

@@ -1,88 +0,0 @@
-import json
-import re
-import sys
-
-import execjs
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import PageCookiePool
-import requests
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self,redis_key,header,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
-        super(DTCookiePool, self).__init__(redis_key,page_url=None,
-        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
-        self.headers=header
-        self.page_url = page_url
-
-    def create_cookie(self,):
-        session = requests.Session()
-        start_url = self.page_url
-        print(self.headers)
-        res = session.get(start_url, headers=self.headers,verify=False)
-        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
-        js_func = 'function sd() { return ' + js_func + "}"
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        ss = ctx.call("sd")
-        cookies = {}
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-        res = session.get(start_url, cookies=cookies, headers=self.headers)
-        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
-        js_func = re.sub("<(/*?)script>", "", res.text)
-        location = re.compile('location(.*?)}else')
-        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
-        setTimeout2 = re.compile('setTimeout(.*?)document')
-        gox = re.compile('};go(.*?)\)')
-        js_func = re.sub(location, "}else", js_func)
-        js_func = re.sub(setTimeout, "       document", js_func)
-        js_func = re.sub(setTimeout2, "       document", js_func)
-        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
-        js_func = '''const jsdom = require("jsdom");
-        const {JSDOM} = jsdom;
-        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
-        window = dom.window;
-        document = window.document;''' + js_func
-        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
-        with open('ex_js.js', 'w+', encoding='utf-8') as f:
-            f.write(js_func)
-        print(js_do_data)
-        ss = ctx.call("go", json.loads(js_do_data))
-
-        for item in ss.split(";"):
-            if '=' in item:
-                cookies[item.split("=")[0]] = item.split("=")[-1]
-                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-        res = session.get(start_url, headers=self.headers, cookies=cookies)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        return cookies
-
-if __name__ == '__main__':
-    headers = {
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    "Accept-Encoding": "gzip, deflate, br",
-    "Accept-Language": "zh-CN,zh;q=0.9",
-    "Cache-Control": "max-age=0",
-    "Connection": "keep-alive",
-    "Host": "www.hefei.gov.cn",
-    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"Windows\"",
-    "Sec-Fetch-Dest": "document",
-    "Sec-Fetch-Mode": "navigate",
-    "Sec-Fetch-Site": "none",
-    "Sec-Fetch-User": "?1",
-    "Upgrade-Insecure-Requests": "1",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-}
-
-    cookie_pool = DTCookiePool(
-        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
-        header=headers, redis_key="dongtaices")
-    cookie = cookie_pool.get_cookie()
-    print(cookie)
-    # cookie_pool.del_cookie(cookie)
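
create_cookie above evaluates the site's anti-bot JavaScript and gets back a document.cookie style string, which is then split into a dict before the page is requested again. That parsing step in isolation, with a sample string for illustration; unlike the split("=")[-1] used above, partition keeps values that themselves contain an equals sign:

    def cookie_string_to_dict(cookie_str):
        """Turn 'name1=value1; name2=value2' output from document.cookie into a requests-style dict."""
        cookies = {}
        for part in cookie_str.split(";"):
            if "=" in part:
                name, _, value = part.strip().partition("=")
                cookies[name] = value
        return cookies

    print(cookie_string_to_dict("acw_tc=abc123; _session=xyz=="))
    # {'acw_tc': 'abc123', '_session': 'xyz=='}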

File diff content is too large to display
+ 0 - 1
NoteWork/details/file/sj.js


+ 0 - 34
NoteWork/details/迁移.py

@@ -1,34 +0,0 @@
-from feapder.db.mongodb import MongoDB
-
-
-class Details:
-    _to_db = None
-    _to_db_xs = None
-    db_name = 'mgp_list'
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    @property
-    def to_db_xs(self):
-        if not self._to_db_xs:
-            self._to_db_xs = MongoDB(port=27001)
-        return self._to_db_xs
-    def main(self):
-        data_lsit = self.to_db.find(self.db_name, {"parser_name": "details"},sort={"date":-1})
-        for item in data_lsit:
-            # print(item.get("item").get("publishtime"))
-            print(item.get("date"))
-            del item["_id"]
-            # # print(item)
-            if item.get("item").get("publishtime") > '2021-12-15 09:12:43':
-                print(item)
-            else:
-                # self.to_db_xs.add(self.db_name, item)
-                self.to_db.delete(self.db_name, item)
-            # self.to_db.delete(self.db_name, item)
-
-Details().main()

+ 0 - 0
spiders/__init__.py


+ 0 - 0
spiders/李宗泽/__init__.py


+ 0 - 0
spiders/马国鹏/__init__.py


+ 0 - 88
spiders/马国鹏/中国南方航空采购招标网.py

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-21 16:19:50
----------
-@summary:中国南方航空采购招标网.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zgnfhkcgzbw(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'url', 'crawl_page'])
-
-         self.menus = [
-             Menu('其它公告', 'a_zgnfhkcgzbw_cggg',
-                  'https://csbidding.csair.com/cms/channel/qtgg/index.htm', 1),
-             Menu('非招标采购-采购结果', 'a_zgnfhkcgzbw_cgjg',
-                  'https://csbidding.csair.com/cms/channel/cgjg/index.htm', 2),
-             Menu('招标公告', 'a_zgnfhkcgzbw_zbgg',
-                  'https://csbidding.csair.com/cms/channel/zbgg/index.htm', 1),
-             Menu('中标公告', 'a_zgnfhkcgzbw_zhbgg',
-                  'https://csbidding.csair.com/cms/channel/bidzbgg/index.htm', 1),
-             Menu('评标公示', 'a_zgnfhkcgzbw_pbgs',
-                  'https://csbidding.csair.com/cms/channel/pbgs/index.htm', 1),
-             Menu('非招标采购-采购公告', 'a_zgnfhkcgzbw_fzbcg_cggg',
-                  'https://csbidding.csair.com/cms/channel/cggg/index.htm', 2),
-             Menu('非招标采购-其它公告', 'a_zgnfhkcgzbw_qtgg',
-                  'https://csbidding.csair.com/cms/channel/fzbqtgg/index.htm', 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            for page in range(1,menu.crawl_page+1):
-                start_url = menu.url + f'?pageNo={page}'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul[@id='list1']/li")
-        for info in info_list:
-            href = info.xpath('./a/@href').extract_first()
-            title = info.xpath('./a/@title').extract_first()
-            # import pdb
-            # pdb.set_trace()
-            # print(info.xpath('./a/text()'))
-            create_time = info.xpath('./a/em/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time.strip()  # 标书发布时间
-            data_item.site = "中国南方航空采购招标网"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.__table_name__= 'mgp_list'
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="main-text"]']
-            # list_item.create_time = '//div[@class="article-author"]/text()[-1]'
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Zgnfhkcgzbw(redis_key="fwork:Zgnfhkcgzbw").start()
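
The list spiders in this commit all follow the same incremental pattern around feapder's Dedup: each href is checked with filter_exist_data (an empty result means "already collected"), an item is yielded only for unseen links, and the batch is added to the Bloom filter after the items have been emitted. A stripped-down sketch of that split, with an illustrative helper name:

    from feapder.dedup import Dedup

    def split_new_links(hrefs):
        """Return the Dedup instance and the subset of hrefs not yet in the Bloom filter."""
        dedup = Dedup(Dedup.BloomFilter)
        fresh = [h for h in hrefs if dedup.filter_exist_data([h])]   # [] means "already seen"
        return dedup, fresh

    # usage mirrors parse() above: yield one item per fresh href, then dedup.add(fresh)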

+ 0 - 75
spiders/马国鹏/中国石化物质采购电子商务平台.py

@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-16 15:53:39
----------
-@summary:中国石化物质采购电子商务平台.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import timestamp_to_date
-
-
-class Zshcg(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('	独家采购公示', 'a_zgshwzcgdzswpt_djcggs', "Notice", 2),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                start_url = f'https://ec.sinopec.com/f/supp/bid/queryOnlyBill.do?pageNo={page}&_=1639640334801'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        # print(response.json)
-        info_list = response.json.get("result").get("result")
-        # return
-        for info in info_list:
-            href = f'https://ec.sinopec.com/f/supp/notice/viewArticle.do?id={info.get("id")}'
-            title =info.get("title")
-            create_time = timestamp_to_date(info.get("createdate").get("time")/1000)
-
-            list_item = DataBakItem()  # 存储数据的管道
-            list_item.href = href  # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title  # 标题
-            list_item.publishtime = create_time  # 标书发布时间
-            list_item.site = "中国石化物资采购电子商务平台"
-            list_item.area = "全国"  # 城市默认:全国
-            list_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            mgp =  MgpListItem()
-            mgp.parse = "self.detail_get"
-            mgp.parser_name = "details"
-            mgp.item = list_item.to_dict
-            mgp.deal_detail = ['//div[@class="wrap"]','//div[@id="middle"]']
-            mgp.parse_url = href
-            href_list.append(href)
-        #     yield mgp
-        # dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-
-if __name__ == "__main__":
-    Zshcg(redis_key="fwork:{spider_name}").start()
-    # print(timestamp_to_date(1639635843,time_format="%Y-%m-%d %H:%M:%S"))
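
The Sinopec spider above receives publish times as millisecond epochs (info["createdate"]["time"]) and converts them with feapder's timestamp_to_date after dividing by 1000. An equivalent conversion using only the standard library, shown here as a hedged stand-in for that call:

    import time

    def ms_timestamp_to_date(ms, fmt="%Y-%m-%d %H:%M:%S"):
        """Convert a millisecond epoch to a local-time date string."""
        return time.strftime(fmt, time.localtime(ms / 1000))

    print(ms_timestamp_to_date(1639640334801))   # e.g. '2021-12-16 15:38:54' in UTC+8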

+ 0 - 98
spiders/马国鹏/中泰集团招标投标网.py

@@ -1,98 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-17 09:39:39
----------
-@summary: 中泰集团招标投标网
----------
-@author: maguopemng
-"""
-import sys
-
-from requests_toolbelt import MultipartEncoder
-
-
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.clean_html.defaults import cleaner
-
-
-
-class AZtjtzbtbwXxggQb(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "中泰集团招标投标网"
-
-         self.menus = [
-             Menu('信息公告-全部', 'a_ztjtzbtbw_xxgg_qb', "自定义参数", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://scm.zthx.com/srm-pb-web/portalBulletinNoAuth/listByPageNoAuth'
-                 multipart_data = MultipartEncoder(
-                     fields={
-                     "Q_EQ_bidTypeValue": "",
-                     "Q_EQ_noticeTypeValue": "",
-                     "Quick_value": "",
-                     "S_releaseDate": "desc",
-                     "page": str(page),
-                     "rows": "15"
-                 })
-                 headers = {
-                     "Content-Type": multipart_data.content_type
-                 }
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,
-                                       data=multipart_data,headers=headers)
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = f'http://scm.zthx.com/?id={info.get("id")}'
-            title = info.get("title")
-            create_time = info.get("releaseDate")
-            html = info.get("mainBody")
-            result = cleaner(html)
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            data_item.contenthtml = html  # 城市 默认为空
-            data_item.detail = result  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-
-            href_list.append(href)
-            yield data_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    AZtjtzbtbwXxggQb(redis_key="maguopemng:AZtjtzbtbwXxggQb").start()
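
The Zthx spider above posts its query as multipart/form-data: requests_toolbelt's MultipartEncoder builds the body, and its content_type (which carries the generated boundary) has to be copied into the Content-Type header. The same pattern with plain requests, as a minimal sketch; the field values are placeholders:

    import requests
    from requests_toolbelt import MultipartEncoder

    def post_multipart(url, fields):
        """POST string form fields as multipart/form-data and return the response."""
        body = MultipartEncoder(fields=fields)
        headers = {"Content-Type": body.content_type}   # includes the boundary
        return requests.post(url, data=body, headers=headers, timeout=30)

    # post_multipart("http://scm.zthx.com/srm-pb-web/portalBulletinNoAuth/listByPageNoAuth",
    #                {"page": "1", "rows": "15", "S_releaseDate": "desc"})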

+ 0 - 133
spiders/马国鹏/中铁鲁班商务网.py

@@ -1,133 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-20 13:49:04
----------
-@summary: Zglbsww
----------
-@author: dist
-"""
-import json
-import sys
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Zglbsww(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'purchaseType',"orders", 'crawl_page'])
-         self.site= "中铁鲁班商务网"
-
-         self.menus = [
-             Menu('公告补遗-招标采购', 'a_ztlbsww_zhbgg', "CRFQ","publish_time", 1),
-             Menu('公告补遗-询价采购', 'a_ztlbsww_ggby_xjcg', "XJFQ","publish_time", 1),
-             Menu('公告补遗-竞争性谈判', 'a_ztlbsww_cqby', "TPFQ","publish_time", 1),
-             Menu('公告补遗-竞价采购', 'a_ztlbsww_ggby_jjcg', "JJFQ","publish_time", 1),
-
-             Menu('采购公告-招标采购', 'a_ztlbsww_zbgg', "CRFQ","pub_time", 1),
-             Menu('采购公告-询价采购', 'a_ztlbsww_lsxjcg', "XJFQ","pub_time", 1),
-             Menu('采购公告-竞争性谈判', 'a_ztlbsww_jzxtp', "TPFQ","pub_time", 1),
-             Menu('采购公告-竞价采购', 'a_ztlbsww_jjcg', "JJFQ","pub_time", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 '''
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 https://eproport.crecgec.com/epu-portal/portal/project/listWithPage
-                 '''
-                 start_url = f'https://eproport.crecgec.com/epu-portal/portal/project/listWithPage'
-                 data = {
-                     "timeType": "month",
-                     "areaCode": "-1",
-                     "mainType": "-1",
-                     "purchaser": None,
-                     "information": None,
-                     "sTime": "",
-                     "eTime": "",
-                     "classify": "-1",
-                     "region": "-1",
-                     "level": "",
-                     "selectedState": "",
-                     "purchaseType": menu.purchaseType,
-                     "noticeType": 1,
-                     "orders": menu.orders,
-                     "dirs": "desc",
-                     "current": page,
-                     "size": 10,
-                     "page": {}
-                 }
-                 data = json.dumps(data)
-
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,method="POST",data=data)
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            projectid = info.get("projectId")
-            tenantid = info.get("tenantId")
-            href = f'https://eproport.crecgec.com/#/notice/noticexj-detail?projectId={projectid}&tenantId={tenantid}'
-            title = info.get("projectName")
-            create_time = info.get("publishTime") + ":00"
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_ztlbw"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//*']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="****"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-    def download_midware(self, request):
-        request.headers = {
-
-            "Content-Type": "application/json"
-        }
-if __name__ == "__main__":
-    Zglbsww(redis_key="dist:Zglbsww").start()

+ 0 - 105
spiders/马国鹏/亿企优采.py

@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-16 09:23:14
----------
-@summary: Yqyc
----------
-@author: maguopemng
-"""
-import sys
-
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem, MgpListItem, ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from requests_toolbelt import MultipartEncoder
-import json
-
-
-class Yqyc(feapder.Spider):
-
-    def start_callback(self):
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-        self.site = "亿企优采"
-
-        self.menus = [
-            Menu('竞价结果列表', 'a_yqyc_jjjglb', "bidResultList", 1),
-            Menu('待竞价列表', 'a_yqyc_djjlb', "biddingList", 1),
-        ]
-
-    def start_requests(self):
-        for menu in self.menus:
-            for page in range(1, menu.crawl_page + 1):
-                start_url = f'http://www.vins.com.cn/business/login/{menu.types}'
-                multipart_data = MultipartEncoder(
-                    fields={"page": json.dumps(
-                        {"numPerPage": 10, "pageNum": page, "condition": "LIKE", "keyword": "f01", "searchValue": "",
-                         "orderField": "", "orderDirection": "", "filterParams": {}})})
-                headers = {
-                    "Content-Type": multipart_data.content_type
-                }
-                yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False,
-                                      data=multipart_data, method="POST", headers=headers)
-
-    def parse(self, request, response):
-        print(response.text)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("records")
-        for info in info_list:
-            if menu.get("types")=="biddingList":
-                href = f'http://www.vins.com.cn/business/bidingDetail?fid={info.get("f14")}&school={info.get("f04")}'
-                title = f'待竞价详细({info.get("f01")})---- {info.get("f05")}'
-                create_time = info.get("f07")
-            else:
-                href = f'http://www.vins.com.cn/business/bidResultDetail?fid={info.get("f14")}&school={info.get("f04")}'
-                title = f'竞价结果详细({info.get("f01")})---- {info.get("f05")}'
-                create_time = info.get("f25")
-
-
-            area = "全国"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_firefox"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentWrapper"]']
-            list_item.proxies = False
-            list_item.render_time = 3
-            list_item.parse_url = href
-            list_item.pri = 1
-
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-
-if __name__ == "__main__":
-    Yqyc(redis_key="maguopemng:Yqyc").start()

+ 0 - 76
spiders/马国鹏/华润置地华东大区网站.py

@@ -1,76 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:45:21
----------
-@summary:华润置地华东大区网站
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Hrzdhddqwz(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('Hrzdhddqwz', 'Hrzdhddqwz', "Notice", 1),
-             # Menu('Hrzdhddqwz', 'Hrzdhddqwz', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            start_url = f'https://sh.crland.com.cn/shanghai1/index.html'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        # print(response.text)
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//div[@class='east-tender']//tr[position()>1]")
-        for info in info_list:
-            href = info.xpath('./td[2]/a/@href').extract_first()
-            title = info.xpath('./td[2]/a/text()').extract_first()
-            create_time = info.xpath('./td[4]/text()').extract_first()
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "华润置地华东大区网站"
-            data_item.area = "上海市"  # 城市默认:全国
-            data_item.city = "上海市"  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"  # 虽然用的json方法,但处理的不是json型数据,因为title需要重查
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = '''
-title = response.xpath('//div[@class="east-news-detail-title"]/text()').extract_first()
-html = response.xpath('//div[@class="east-news-detail-bottom"]').extract_first()
-list_item.title = title
-list_item.contenthtml = html
-            '''
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Hrzdhddqwz(redis_key="fwork:Hrzdhddqwz").start()

+ 0 - 120
spiders/马国鹏/南通市如皋市政府采购网上商城.py

@@ -1,120 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-18 13:18:40
----------
-@summary: 	南通市如皋市政府采购网上商城
----------
-@author: maguopemng
-"""
-import sys
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class JsNtsrgszfcgwssc(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "南通市如皋市政府采购网上商城"
-
-         self.menus = [
-             Menu('分散公告', 'js_ntsrgszfcgwssc_fsgg', "自定义参数", 1),
-             # Menu('JsNtsrgszfcgwssc抓取栏目', 'JsNtsrgszfcgwssc爬虫code', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://rugao.ntzfcg.cn/cgr_articles.html?category_id=5&page={page}'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//ul[@class="list_main"]/li')
-        for info in info_list:
-            href = info.xpath('./a/@href').extract_first()
-            title = info.xpath('./a/text()').extract()[-1].strip()
-            create_time = "20" + info.xpath('./a/span/text()').extract_first().strip()
-            area = "江苏"  # 省份
-            city = "南通市"  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="nes_details"]']
-            list_item.proxies = False
-            list_item.ex_python = '''
-js_str="""function randomString(e) {
-    e = e || 32;
-    var t = "ABCDEFGHJKMNPQRSTWXYZabcdefhijkmnprstwxyz2345678"
-      , n = t.length
-      , o = "";
-    for (i = 0; i < e; i++)
-        o += t.charAt(Math.floor(Math.random() * n));
-    return o
-}
-function undefind_function(nowtimes) {
-    for (var e = nowtimes, t = (new Date).getSeconds(), i = 100 * Number(e) + t, n = parseInt(Number(i) / 1e3), o = new Array(4), r = 3; 0 <= r; r--)
-        3 == r ? o[3] = Number(i) % 1e3 : (o[r] = n % 1e3,
-        n = parseInt(n / 1e3));
-    var s = o.map(function(e) {
-        var t, i = [1, 3, 5, 7, 9], n = [0, 2, 4, 6, 8];
-        return e < 480 ? (e = 1e3 - e,
-        t = i[Math.floor(Math.random() * i.length)]) : t = n[Math.floor(Math.random() * n.length)],
-        (randomString(2) + e.toString(16) + t).toUpperCase()
-    }).join("-")
-      , a = parseInt(t / 10)
-      , l = t % 10
-      , c = a * l * 100 + 10 * (a + 1) + (9 - l);
-    return "_new_rugao_session=" + s + "-" + randomString(4) + c
-}"""
-ctx = execjs.compile(js_str)
-cookie = ctx.call("undefind_function",str(int(time.time())))  
-request_params["headers"]={"Cookie":cookie}         
-
-'''
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//span[@class="font16 cgr_ar_content  mb29"]/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "file_type":'file_name',
-                "files_type":('zip','doxc','ftp','rar','pdf','xlsx','doc','jpg'), # 需要下载的附件类型
-                "url_key":'attachments', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host":'http://rugao.ntzfcg.cn',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    JsNtsrgszfcgwssc(redis_key="maguopemng:JsNtsrgszfcgwssc").start()
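
The ex_python block above is stored on the task as text; the detail runner executes it before building the request, where it uses execjs to evaluate the site's session-cookie JavaScript and injects the result through request_params["headers"]. The execjs call pattern on its own, with a trivial stand-in function (a Node.js runtime must be available to PyExecJS):

    import execjs

    js_src = "function make_session(seed) { return '_new_rugao_session=' + seed; }"  # stand-in for the real JS above
    ctx = execjs.compile(js_src)
    print(ctx.call("make_session", "abc"))   # _new_rugao_session=abc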

+ 0 - 101
spiders/马国鹏/天津市政府采购网.py

@@ -1,101 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-13 10:04:03
----------
-@summary:天津市政府采购网.py
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-from feapder.utils.tools import format_date
-import time
-
-class Tjszf(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
-    def start_callback(self):
-         self.count = 0
-         self.prox_pool = ProxyPool()
-         Menu = namedtuple('Menu', ['channel', 'code', 'id','st', 'crawl_page'])
-
-         self.menus = [
-             Menu('采购公告市级',        'tj_tjszfcgw_cggg_sj', "1665", 1,   1),
-             Menu('采购公告区县',        'tj_tjszfcgw_cggg_qj', "1664",None, 1),
-             Menu('采购结果公告市级',     'tj_tjszfcgw_cgjggg_sj', "2014", 1,   1),
-             Menu('采购结果公告区县',     'tj_tjszfcgw_cgjggg_qx', "2013",None, 1),
-             Menu('采购需求征求意见市级',  'tj_tjszfcgw_cgxqzqyj_sj', "1662",1, 1),
-             Menu('采购需求征求意见区县', 'tj_tjszfcgw_cgxqzqyj_qj', "1994", None, 1),
-             Menu('单一来源公示-市级',    'tj_tjszfcgw_cgxqzqyj_sj', "2033", 1,   1),
-             Menu('单一来源公示-区级',    'tj_tjszfcgw_dylygs_qx', "2034", None, 1),
-             Menu('更正公告市级',        'tj_tjszfcgw_gzgg_sj', "1663", 1, 1),
-             Menu('更正公告区县',        'tj_tjszfcgw_gzgg_qx', "1666", None, 1),
-             Menu('合同验收公告市级',     'tj_tjszfcgw_htysgg_sj', "2015", 1, 1),
-             Menu('合同验收公告区县',     'tj_tjszfcgw_htysgg_qx', "2016", None, 1),
-             Menu('监督检查处理决定公告-市级','tj_tjszfcgw_jdjccjjdgg_sj', "5776730", 1, 1),
-             Menu('监督检查处理决定公告-区级','tj_tjszfcgw_jdjccjjdgg_qj', "5903537", None, 1),
-             Menu('投诉处理决定-市级',     'tj_tjszfcgw_tscljd', "5776729", None, 1),
-             Menu('投诉处理决定公告-区级',  'tj_tjszfcgw_tscljd_qj', "5903425", None, 1),
-             Menu('采购意向公开-市级',  'tj_tjszfcgw_cgyxgk_sj', "2021", 1, 1),
-             Menu('采购意向公开-区级',  'tj_tjszfcgw_cgyxgk_qj', "2022", None, 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-            stmp = int(time.time()*1000)
-            start_url = f'http://www.ccgp-tianjin.gov.cn/portal/topicView.do?method=view&view=Infor&id={menu.id}&ver=2{"&st"+str(menu.st) if menu.st else ""}&stmp={stmp}'
-            yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        # print(response.text)
-        info_list = response.xpath('//ul[@class="dataList"]/li')
-        menu = request.item
-        self.count += 1   # 一个计数器
-        href_list = []
-        dedup = Dedup(Dedup.BloomFilter, absolute_name="boolm:list")
-        for info in info_list:
-            create_time = info.xpath("./span/text()").extract_first()
-            create_time = format_date(create_time, "%a %b %d %H:%M:%S CST %Y")
-            href = info.xpath("./a/@href").extract_first()
-            title = info.xpath("./a/@title").extract_first()
-            list_item = DataBakItem()  # 存储数据的管道
-            list_item.href = href  # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title  # 标题
-            list_item.publishtime = create_time  # 标书发布时间
-            list_item.site = "天津市政府采购网"
-            list_item.area = "天津市"  # 城市默认:全国
-            list_item.city = "天津市"  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            mgp =  MgpListItem()
-            mgp.parse = "self.detail_get"
-            mgp.parser_name = "details"
-            mgp.item = list_item.to_dict
-            # mgp.author = '马国鹏'
-            mgp.deal_detail = ['//table',"//div[@class='pageInner']"]
-            mgp.parse_url = href
-            href_list.append(href)
-            yield mgp
-        dedup.add(href_list)
-
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
-
-    def download_midware(self, request):
-        request.proxies = self.prox_pool.get()
-        return request
-
-if __name__ == "__main__":
-    Tjszf(redis_key="magp:tjszfcg").start()
-'''
-imageString=67&method=downEnIdFile1&id=301079006&fileId=LwQVvvUfo5A*
-
-'''
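
The Tianjin spider above receives publish times in the form "Mon Dec 13 10:04:03 CST 2021" and normalizes them with feapder's format_date. A standard-library equivalent of that conversion, shown as a hedged sketch (it assumes an English locale for the %a/%b tokens):

    from datetime import datetime

    def normalize_pub_time(raw):
        """Convert 'Mon Dec 13 10:04:03 CST 2021' style strings to 'YYYY-MM-DD HH:MM:SS'."""
        cleaned = raw.replace("CST ", "")     # strptime has no portable handling for the CST token
        return datetime.strptime(cleaned, "%a %b %d %H:%M:%S %Y").strftime("%Y-%m-%d %H:%M:%S")

    print(normalize_pub_time("Mon Dec 13 10:04:03 CST 2021"))   # 2021-12-13 10:04:03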

+ 0 - 137
spiders/马国鹏/广东省政府采购网.py

@@ -1,137 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-18 09:41:49
----------
-@summary: Gdszfcgw
----------
-@author: dist
-"""
-import sys
-from urllib.parse import urljoin
-
-import requests
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder,time
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.get_imgcode import get_code
-#
-# # custom_settings = { 'DOWNLOAD_DELAY': 10, 'CONCURRENT_REQUESTS_PER_IP': 4, 'DOWNLOADER_MIDDLEWARES': {}, }
-# settings = { 'LOG_LEVEL': "INFO" }
-class Gdszfcgw(feapder.Spider):
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'noticetype','notchannel', 'crawl_page'])
-         self.site= "广东省政府采购网"
-         self.host = 'https://gdgpo.czt.gd.gov.cn'
-
-         self.menus = [
-             Menu('采购意向公开', 'gd_gdszfcgwxwz_cgyxgk','59','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('单一来源公示', 'gd_gdszfcgwxwz_cggg_pccgyxgk','001051','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('采购计划', 'gd_gdszfcgwxwz_cgjh', '001101','95ff31f3-a1af-4bc4-b1a2-54c894476193', 1),   #1
-             Menu('采购需求', 'gd_gdszfcgwxwz_cgxq', '001059','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('资格预审公告', 'gd_gdszfcgwxwz_zgysgg', '001052,001053','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #2
-             Menu('采购公告', 'gd_gdszfcgwxwz_cggg', '00101','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('中标成交公告', 'gd_gdszfcgwxwz_zbcjgg', '00102','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('更正公告', 'gd_gdszfcgwxwz_gzgg', '00103','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('终止公告', 'gd_gdszfcgwxwz_zzgg', '001004,001006','fca71be5-fc0c-45db-96af-f513e9abda9d', 1), #3
-             Menu('合同公告', 'gd_gdszfcgwxwz_htgg', '001054','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),
-             Menu('验收公告', 'gd_gdszfcgwxwz_ysgg', '001009,00105A','fca71be5-fc0c-45db-96af-f513e9abda9d', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '201022,201023,201111,00107D','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '202022,202023,202111,00107E,001076','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001071','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '204022,204023,204111,204112','3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  #4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001054', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-             Menu('电子卖场信息', 'gd_gdszfcgwxwz_ysgg', '001009,00105A', '3b49b9ba-48b6-4220-9e8b-eb89f41e9d66', 1),  # 4
-
-             # Menu('批量采购', 'gd_gdszfcgwxwz_plcg',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/dzmcgg/index.html', 1),
-             # Menu('进口产品清单', 'gd_gdszfcgwxwz_jkcpqd',
-             #      'https://gdgpo.czt.gd.gov.cn/freecms/site/guangdong/jkcpqd/index.html','','d7284b7e-29e9-4fe4-bad3-b187ec8edbf9' 1),
-         ]
-    def start_requests(self):
-        code = self.get_code()
-        for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do?&siteId=cd64e06a-21a7-4620-aebc-0576bab7e07a&channel={menu.notchannel}&currPage={page}&pageSize=10&noticeType={menu.noticetype}&regionCode=440001&verifyCode={code}&subChannel=false&purchaseManner=&title=&openTenderCode=&purchaser=&agency=&purchaseNature=&operationStartTime=&operationEndTime=&selectTimeName=noticeTime'
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
-    def get_code(self):
-        img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
-        header = {"Host": "www.ccgp-tianjin.gov.cn",
-                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
-                  "Origin": "http://www.ccgp-tianjin.gov.cn",
-
-                  }
-        res = requests.get(img_url, headers=header)
-        with open('image/guangdong.jpg', 'wb+') as f:
-            f.write(res.content)
-        res = get_code('image/guangdong.jpg')
-        if res.get("msg")=="success":
-            img_code = res.get("r").get("code")
-        else:
-            img_code = None
-        return img_code
-
-
-    def parse(self, request, response):
-        time.sleep(0.3)
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data")
-        for info in info_list:
-            href = info.get("pageurl")
-            title = info.get("shorttitle")
-            create_time = info.get("addtimeStr")
-            href = urljoin(self.host, href)
-
-            area = "广东"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area  # 城市默认:全国
-            data_item.city = city  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="info-article in active"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//div[@class="info-article in active"]//div/a',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp','pdf'), # 需要下载的附件类型
-                # "file_type":'zip', # 默认的附件类型,用于url中未带附件类型的
-                "url_key":'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                # "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gdszfcgw(redis_key="dist:Gdszfcgw").start()
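
get_code() above shows the captcha flow for the Guangdong list API: download the verifyCode image, send it to the in-house OCR helper (untils.get_imgcode.get_code), and splice the recognized text into the list URL as verifyCode=. The fetch-and-recognize step on its own, with an illustrative function name and file path:

    import requests
    from untils.get_imgcode import get_code

    def fetch_verify_code(img_url, path="image/verify.jpg"):
        """Download a captcha image and return the recognized text, or None on failure."""
        resp = requests.get(img_url, timeout=10)
        with open(path, "wb") as f:
            f.write(resp.content)
        result = get_code(path)                  # external OCR service used above
        if result.get("msg") == "success":
            return result.get("r", {}).get("code")
        return None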

+ 0 - 75
spiders/马国鹏/广发证券采购平台.py

@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-04 13:12:42
----------
-@summary: 广发证券采购平台
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from urllib.parse import urljoin
-
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Gfzqcgpt(feapder.Spider):
-
-    def start_callback(self):
-         self.count = 0
-         self.host = 'https://gfjc.gf.com.cn'
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('公告公示信息', 'a_gfzqcgpt_gggsxx', "gonggao", 1),
-             # Menu('Gfzqcgpt', 'Gfzqcgpt', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                start_url = f'https://gfjc.gf.com.cn/gonggao/index_{page}.jhtml'
-                yield feapder.Request(url=start_url, item=menu._asdict())
-
-    def parse(self, request, response):
-        menu = request.item
-        self.count += 1   # 一个计数器
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//div[@class="list-news-mode"]')
-        for info in info_list:
-            href = urljoin(self.host, info.xpath('./div/a/@href').extract_first())
-            title = info.xpath('./div/a/text()').extract_first()
-            create_time = info.xpath('./div/div/span[3]/text()').extract_first()
-            create_time = create_time.split(":")[-1]
-
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "广发证券采购平台"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="list-news-box"]']
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Gfzqcgpt(redis_key="fwork:Gfzqcgpt").start()

+ 0 - 110
spiders/马国鹏/杭州市公共资源交易.py

@@ -1,110 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-11-26 16:28:18
----------
-@summary: 杭州市公共资源交易
----------
-@author: 马国鹏
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from collections import namedtuple
-import feapder
-import time
-from feapder.dedup import Dedup
-from items.spider_item import DataBakItem, MgpListItem
-
-
-class Hzsggzy(feapder.Spider):
-    # custom database config; can be removed when the project has its own setting.py
-    def start_callback(self):
-        self.start_url = 'https://ggzy.hzctc.hangzhou.gov.cn/SecondPage/GetNotice'
-        self.count = 0
-
-    def start_requests(self):
-        Menu = namedtuple('Menu', ['channel', 'code', 'afficheType', 'crawl_page'])
-        menus = [
-            # Menu('工程建设-项目合同', 'zj_hzsggzyjyw_gcjs_xmht', "506", 1, ),
-            # Menu('工程建设-招标文件预公示', 'zj_hzsggzyjyw_gcjs_zbwjygs', "505", 2, ),
-            # Menu('工程建设-核准信息公告', 'zj_hzsggzyjyw_gcjs_hzxxgg', "518", 1, ),
-            # Menu('政府采购-更正答疑', 'zj_hzsggzyjy_zfcg_gzdy2', "27", 1, ),
-            Menu('政府采购-采购公告', 'zj_hzsggzyjy_zfcg_cggg2', "29", 2, ),
-            Menu('综合其他-中标结果公告', 'zj_hzsggzyjyw_zhqt_zbjggg', "507", 1, ),
-            Menu('综合其他-中标前公示', 'zj_hzsggzyjyw_zhqt_zbqgs', "37", 1, ),
-            Menu('综合其他-答疑文件', 'zj_hzsggzyjyw_zhqt_dywj', "499",1, ),
-            Menu('综合其他-答疑公告', 'zj_hzsggzyjyw_zhqt_dygg', "469", 1, ),
-            Menu('综合其他-招标公告', 'zj_hzsggzyjyw_zhqt_zbgg', "34", 1, ),
-
-            Menu('工程建设-招标公告', 'zj_hzsggzyjy_gcjs_zbgg', "22", 1, ),
-            Menu('工程建设-答疑文件', 'zj_hzsggzyjy_gcjs_dywj', "23", 1, ),
-            Menu('工程建设-答疑公告', 'zj_hzsggzyjy_gcjs_dygg', "465", 1, ),
-            Menu('工程建设-开标结果公示', 'zj_hzsggzyjy_gcjs_kbjggs', "486", 1, ),
-            Menu('工程建设-中标前公示', 'zj_hzsggzyjy_gcjs_zhbqgs', "25", 1, ),
-            Menu('工程建设-中标公告', 'zj_hzsggzyjy_gcjs_zbgs', "28", 1, ),
-
-            Menu('政府采购-意见征询', 'zj_hzsggzyjy_zfcg_yjzx', "26", 1, ),
-            Menu('政府采购-答疑公告', 'zj_hzsggzyjy_zfcg_dygg', "466", 1, ),
-            Menu('政府采购-结果公告', 'zj_hzsggzyjy_zfcg_jggg', "32", 1, ),
-
-        ]
-        for menu in menus:
-            for page in range(1,menu.crawl_page+1):
-
-                data = {
-                    "area":"",
-                    "afficheType":menu.afficheType,
-                    "IsToday":"",
-                    "title":"",
-                    "proID":"",
-                    "number":"",
-                    "_search":"false",
-                    "nd":int(time.time()*1000),
-                    "rows":"10",
-                    "page":page,
-                    "sidx":"PublishStartTime",
-                    "sord":"desc"
-                }
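-                # the payload above looks like a jqGrid-style paging request (an assumption from the parameter names):
-                # "rows" is the page size, "sidx"/"sord" sort by PublishStartTime desc, "nd" is a millisecond cache-buster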
-
-                yield feapder.Request(url=self.start_url,data=data,method="POST",item=menu._asdict(),verify=False,proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        href_list = []
-        dedup = Dedup(Dedup.BloomFilter)
-        info_list =response.json.get("rows")
-        for info in info_list:
-            info_id = info.get("ID")
-            tenderno = info.get("TenderNo")
-            title = info.get("TenderName")
-            create_time = info.get("PublishStartTime")
-            inner = info.get("IsInner")
-            href = f'https://ggzy.hzctc.hangzhou.gov.cn/AfficheShow/Home?AfficheID={info_id}&IsInner={inner}&ModuleID={menu.get("afficheType")}'
-            data_item = DataBakItem()
-            data_item.href = href
-            data_item.title = title
-            data_item.publishtime = create_time
-            data_item.channel = menu.get("channel")
-            data_item.spidercode = menu.get("code")
-            data_item.site = "杭州市公共资源交易"
-            data_item.area = "浙江"
-            data_item.city = "杭州市"
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item = MgpListItem()
-            # list_item.__table_name__ = 'mgp_list'
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ["//div[@class='content']"]
-            # list_item.create_time = '//div[@class="article-author"]/text()[-1]'
-            list_item.parse_url = href
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-
-
-
-
-if __name__ == "__main__":
-    Hzsggzy(redis_key="mgp:hzsggzy",debug=True).start()

+ 0 - 99
spiders/马国鹏/武汉市公共资源交易平台.py

@@ -1,99 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-12-29 10:06:02
----------
-@summary:  武汉市公共资源交易平台
----------
-@author: topnet
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem
-from untils.cookie_pool import PageCookiePool
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Whsggzyjypt(feapder.Spider):
-
-    cookie_pool = PageCookiePool(redis_key='fwork:Whsggzyjypt',page_url='https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do')
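-    # assumption: PageCookiePool warms cookies by visiting page_url and caches them in redis
-    # under 'fwork:Whsggzyjypt'; download_midware() below pulls one cookie per request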
-
-    def start_callback(self):
-         self.count = 0
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-         self.menus = [
-             Menu('资格预审公示', 'hb_whsggzyjypt_zgysgs', "Notice", 3), # roughly 300 pages of historical data
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             start_url = f'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoList.do'
-             for page in range(1,menu.crawl_page+1):
-                 data = {
-                     "page": page,
-                     "rows": "10"
-                 }
-                 yield feapder.Request(url=start_url, data=data, method="POST", item=menu._asdict())
-
-    def parse(self, request, response):
-        if '当前操作存在安全风险' in response.text:
-            self.cookie_pool.del_cookie(request.cookies)
-            yield request
-            return  # re-queue with a fresh cookie and stop parsing the blocked response
-        menu = request.item
-        self.count += 1   # simple request counter
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("rows")
-        for info in info_list:
-            href = f'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoDetail.do?id={info.get("id")}'
-            title = info.get("prjName")
-            create_time = info.get("insertDate")
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = "武汉市公共资源交易平台"
-            data_item.area = "湖北省"  # province
-            data_item.city = "武汉市"  # city
-            ss = dedup.filter_exist_data([href])
-            # if ss == []:
-            #     continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details_cookie"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="pageRight_box"]']
-            list_item.parse_url = href
-            list_item.down_mid = {"key":'fwork:Whsggzyjypt',"text":"当前操作存在安全风险","code":(404,500),
-                                  "page_url":'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do'}
-            href_list.append(href)
-            yield list_item
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.headers = {
-            "Sec-Fetch-Mode": "cors",
-            "Sec-Fetch-Site": "same-origin",
-            "Origin": "https://www.whzbtb.com",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36 Core/1.77.81.400 QQBrowser/10.9.4608.400",
-            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
-            "Accept": "application/json, text/javascript, */*; q=0.01",
-            "Referer": "https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do",
-            "X-Requested-With": "XMLHttpRequest",
-            "Connection": "keep-alive"
-        }
-
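-        # attach a cookie from the shared pool; parse() calls del_cookie() on it when the site
-        # answers with the "当前操作存在安全风险" block page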
-        request.cookies = self.cookie_pool.get_cookie()
-        return request
-
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Whsggzyjypt(redis_key="fwork:Whsggzyjypt").start()

+ 0 - 132
spiders/马国鹏/湖北省政府采购网.py

@@ -1,132 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-16 11:31:01
----------
-@summary: HbHbszfcgwCgyxgg
----------
-@author: maguopemng
-"""
-import sys
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-from untils.cookie_pool import PageCookiePool
-
-
-class HbHbszfcgwCgyxgg(feapder.Spider):
-    cookiepool = PageCookiePool(redis_key='fwork:gszfcg',
-                            page_url='http://www.ccgp-hubei.gov.cn:9040/quSer/initSearch')
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "湖北省政府采购网"
-         self.menus = [
-             Menu('采购意向公告', 'hb_hbszfcgw_cgyxgg', "自定义参数", 1),
-         ]
-
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'http://www.ccgp-hubei.gov.cn:9040/quSer/search'
-                 data = {
-                     "queryInfo.type": "cgyx",
-                     "queryInfo.key": "",
-                     "queryInfo.xmmc": "",
-                     "queryInfo.cgdw": "",
-                     "queryInfo.city": "湖北省",
-                     "queryInfo.qybm": "42????",
-                     "queryInfo.district": "全省",
-                     "queryInfo.je1": "",
-                     "queryInfo.begin": "",
-                     "queryInfo.end": "",
-                     "queryInfo.pageNo": "3",
-                     "queryInfo.pageSize": "15",
-                     "queryInfo.pageTotle": "2950"
-                 }
-                 headers = {
-                     "Content-Type": "application/x-www-form-urlencoded",
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,data=data,method="POST",headers=headers)
-
-    def parse(self, request, response):
-        if '查询失败,请重新查询' in response.text:
-            self.cookiepool.del_cookie(request.cookies)
-            yield request
-            return  # re-queue with a fresh cookie and stop parsing the failed response
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath('//tbody/tr')
-        for info in info_list:
-            href = info.xpath('./td[last()]/a/@href').extract_first()
-            title = info.xpath('./td[2]/text()').extract_first()
-            create_time = info.xpath('./td[5]/text()').extract_first()
-            area = "湖北"  # 省份
-            city = ""  # 城市
-            print(title,create_time,href)
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city, empty by default
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_json"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.proxies = False
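-            # deal_detail below is a code snippet rather than an xpath list; it is presumably exec'd by the
-            # shared "details" parser to build contenthtml and download the base64-id attachments via AttachmentDownloader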
-            list_item.deal_detail = '''
-html = response.xpath('//div[@style="margin: 0 22px;"]').extract_first()   
-list_item.contenthtml=html
-files =  response.xpath('//ul[@class="list-unstyled details-ul"]/li')
-if len(files) > 0:
-    attachments = {}
-    for index, info in enumerate(files):
-        file_id = info.xpath('./a/@href').extract_first().strip("javascript:downloadFile();Base64").strip("'")
-        file_name = info.xpath('./a/@download').extract_first()
-        import base64
-        file_url = 'http://www.ccgp-hubei.gov.cn:8090/gpmispub/download?id=' + base64.b64encode(file_id.encode('utf-8')).decode()
-        file_type = file_name.split(".")[-1].lower()
-        file_name = file_name.split(".")[0]
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=file_name, file_type=file_type, download_url=file_url,
-            enable_proxy=False)
-        attachments[str(len(attachments) + 1)] = attachment
-    if attachments:
-        list_item.projectinfo = {"attachments": attachments}
-            '''
-            list_item.parse_url = href
-            list_item.pri = 1
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-    def download_midware(self, request):
-        request.cookies = self.cookiepool.get_cookie()
-        return request
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    HbHbszfcgwCgyxgg(redis_key="maguopemng:HbHbszfcgwCgyxgg").start()

+ 0 - 113
spiders/马国鹏/滁州市人民政府网.py

@@ -1,113 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-01-14 20:02:21
----------
-@summary: 滁州市人民政府网
----------
-@author: mgp
-"""
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-
-class Czsrmzf(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "滁州市人民政府网"
-
-         self.menus = [
-             Menu('政府信息公开目录-公立医疗机构药品医用设备采购', 'ah_czsrmzfw_gcztb_zbgg', "自定义参数", 1),
-             Menu('重大建设项目-招标投标信息', 'ah_czsrmzfw_zfcg_cggg', "自定义参数", 1),
-             Menu('政府采购', 'ah_czsrmzfw_gcztb_zbgs', "Notice", 1),
-             Menu('工程建设招投标', 'ah_czsrmzfw_zfcg_zbcjgg', "Notice", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f'https://www.chuzhou.gov.cn/chuzhou/site/label/8888'
-                 params = {
-                        "IsAjax": "1",
-                        "dataType": "html",
-                        "_": "0.5840033326645138",
-                        "labelName": "publicInfoList",
-                        "siteId": "2653861",
-                        "pageSize": "20",
-                        "pageIndex": "3",
-                        "action": "list",
-                        "isDate": "true",
-                        "dateFormat": "yyyy-MM-dd",
-                        "length": "50",
-                        "organId": "2681509",
-                        "type": "4",
-                        "catId": "161735369",
-                        "cId": "",
-                        "result": "暂无相关信息",
-                        "title": "",
-                        "fileNum": "",
-                        "keyWords": "",
-                        "file": "/c1/chuzhou/publicInfoList_newest"
-                    }
-                 yield feapder.Request(url=start_url, params=params, item=menu._asdict(), proxies=False)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.xpath("//ul")
-        for info in info_list:
-            href = info.xpath("./li/a/@href").extract_first().strip()
-            title = info.xpath("./li/a/@title").extract_first().strip()
-            create_time = info.xpath("./li/span/text()").extract_first().strip()
-            area = "安徽"  # 省份
-            city = "滁州市"  # 城市
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
-            list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="contentbox minh500"]']
-            list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list_item.files={
-                "list_xpath":'//a[contains(@data-file-ext,"D")]',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp'), # 需要下载的附件类型
-                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host": 'https://www.chuzhou.gov.cn'
-            }
-            href_list.append(href)
-            yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    Czsrmzf(redis_key="magp:Czsrmzf").start()

+ 0 - 92
spiders/马国鹏/玖隆在线_交易公告.py

@@ -1,92 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022-02-15 14:01:43
----------
-@summary: Jlzx
----------
-@author: maguopemng
-"""
-import sys
-
-
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
-import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
-from untils.clean_html.defaults import cleaner
-from feapder.dedup import Dedup
-from collections import namedtuple
-
-class AJlzxJygg(feapder.Spider):
-
-    def start_callback(self):
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-         self.site= "玖隆在线"
-
-         self.menus = [
-             Menu('交易公告', 'a_jlzx_jygg', "自定义参数", 1),
-         ]
-    def start_requests(self):
-         for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = "http://www.e9656.com/portaletm-2.0.0//dataViewAjax!show.htm"
-                 params = {
-                     "callback": "",
-                     "ajaxParam.esbService": "afficheService.queryAfficheAll",
-                     "ajaxParam.esbParam": "%5B%7B%22cmemberCode%22%3A%22S000016%22%2C%22queryOrderStr1%22%3A%22afficheDate%20desc%22%7D%5D",
-                     "paging.limit": "12",
-                     "paging.start": "0",
-                     "ajaxParam.retClass": "com.soft.bc.oamsg.affiche.vo.QueryAffiche",
-                     "ajaxParam.esbParamClass": "[\"com.soft.bc.oamsg.affiche.vo.QueryBean\"]",
-                     "ajaxParam.esbParamName": "[\"queryBean\"]",
-                     "ajaxParam.resultParamName": "data",
-                     "ajaxParam.callbackParam": "{\"maskPlace\":\"$(\\\"div[name='doclist'][id='jygg'],span[name='doclist'][id='jygg']\\\")\"}"
-                 }
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False,params=params)
-
-    def parse(self, request, response):
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("data").get("list")
-        for info in info_list:
-            href = f'https://www.e9656.com/trade//auctionHallAction!getOaAffiche.htm?glistTempbatch={info.get("afficheExbillno")}'
-            title = info.get("afficheTitle")
-            create_time = info.get("afficheEdate")
-            html = info.get("afficheContent")
-            result = cleaner(html)
-
-            area = "江苏"  # 省份
-            city = ""  # 城市
-
-            data_item = DataBakItem()  # item that carries the crawled record downstream
-            data_item.href = href  # link to the announcement detail page
-            data_item.channel = menu.get("channel")  # channel defined in the menus above (assigned by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined in the menus above (assigned by the editor)
-            data_item.title = title  # title
-            data_item.publishtime = create_time  # publish time of the announcement
-            data_item.site = self.site
-            data_item.area = area  # area
-            data_item.city = city  # city, empty by default
-            data_item.contenthtml = html  # raw announcement html from the list api
-            data_item.detail = result  # cleaned plain-text content
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            href_list.append(href)  # record the href so rel_count and dedup.add() below see it
-            yield data_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        yield list
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
-
-if __name__ == "__main__":
-    AJlzxJygg(redis_key="maguopemng:AJlzxJygg").start()

Some files are not shown because too many files changed in this diff