
update: optimize several components and methods

dongzhaorui, 2 years ago
Parent
Current commit
73209a47a7
72 files changed, with 2443 insertions and 4924 deletions
  1. FworkSpider/feapder/VERSION (+15 -2)
  2. FworkSpider/feapder/buffer/__init__.py (+29 -33)
  3. FworkSpider/feapder/buffer/request_buffer.py (+5 -5)
  4. FworkSpider/feapder/commands/create/create_spider.py (+14 -8)
  5. FworkSpider/feapder/commands/shell.py (+5 -5)
  6. FworkSpider/feapder/core/__init__.py (+10 -64)
  7. FworkSpider/feapder/core/collector.py (+42 -25)
  8. FworkSpider/feapder/core/handle_failed_requests.py (+1 -1)
  9. FworkSpider/feapder/core/parser_control.py (+499 -453)
  10. FworkSpider/feapder/core/scheduler.py (+109 -97)
  11. FworkSpider/feapder/core/spiders/__init__.py (+16 -2)
  12. FworkSpider/feapder/core/spiders/air_spider.py (+3 -18)
  13. FworkSpider/feapder/core/spiders/spider.py (+149 -16)
  14. FworkSpider/feapder/db/__init__.py (+1 -5)
  15. FworkSpider/feapder/db/mysqldb.py (+107 -76)
  16. FworkSpider/feapder/dedup/bitarray.py (+14 -20)
  17. FworkSpider/feapder/dedup/expirefilter.py (+13 -2)
  18. FworkSpider/feapder/dedup/old__init__.py (+0 -178)
  19. FworkSpider/feapder/network/__init__.py (+40 -73)
  20. FworkSpider/feapder/network/item.py (+0 -20)
  21. FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt (+0 -20)
  22. FworkSpider/feapder/network/proxy_pool.py (+5 -9)
  23. FworkSpider/feapder/network/request.py (+23 -37)
  24. FworkSpider/feapder/network/request6.29.py (+0 -513)
  25. FworkSpider/feapder/network/response.py (+2 -1)
  26. FworkSpider/feapder/setting.py (+6 -4)
  27. FworkSpider/feapder/templates/air_spider_template.tmpl (+98 -30)
  28. FworkSpider/feapder/templates/project_template/CHECK_DATA.md (+61 -45)
  29. FworkSpider/feapder/utils/__init__.py (+0 -177)
  30. FworkSpider/feapder/utils/custom_argparse.py (+1 -1)
  31. FworkSpider/feapder/utils/js/stealth.min.js (+53 -41)
  32. FworkSpider/feapder/utils/metrics.py (+14 -8)
  33. FworkSpider/feapder/utils/tools.py (+37 -32)
  34. FworkSpider/feapder/utils/webdriver.py (+167 -72)
  35. FworkSpider/items/__init__.py (+71 -53)
  36. FworkSpider/items/spider_item.py (+115 -111)
  37. FworkSpider/login_pool/__init__.py (+0 -0)
  38. FworkSpider/login_pool/zglbw.py (+0 -95)
  39. FworkSpider/mongo_pipeline.py (+0 -56)
  40. FworkSpider/mongo_pipeline_old.py (+0 -98)
  41. FworkSpider/setting.py (+89 -151)
  42. FworkSpider/untils/WebCookiePool.py (+13 -30)
  43. FworkSpider/untils/__init__.py (+13 -2)
  44. FworkSpider/untils/attachment.py (+212 -169)
  45. FworkSpider/untils/chaojiying.py (+0 -61)
  46. FworkSpider/untils/clean_html/__init__.py (+0 -0)
  47. FworkSpider/untils/clean_html/defaults.py (+0 -131)
  48. FworkSpider/untils/cleaner.py (+0 -136)
  49. FworkSpider/untils/cookie_pool.py (+62 -654)
  50. FworkSpider/untils/create_menus.py (+0 -33)
  51. FworkSpider/untils/execptions.py (+11 -15)
  52. FworkSpider/untils/get_imgcode.py (+129 -12)
  53. FworkSpider/untils/proxy_pool.py (+2 -762)
  54. FworkSpider/untils/tools.py (+186 -123)
  55. NoteWork/python乱码识别/__init__.py (+0 -0)
  56. NoteWork/文档/img.png (+0 -0)
  57. NoteWork/文档/img_1.png (+0 -0)
  58. NoteWork/文档/img_10.png (+0 -0)
  59. NoteWork/文档/img_11.png (+0 -0)
  60. NoteWork/文档/img_12.png (+0 -0)
  61. NoteWork/文档/img_13.png (+0 -0)
  62. NoteWork/文档/img_2.png (+0 -0)
  63. NoteWork/文档/img_3.png (+0 -0)
  64. NoteWork/文档/img_4.png (+0 -0)
  65. NoteWork/文档/img_5.png (+0 -0)
  66. NoteWork/文档/img_6.png (+0 -0)
  67. NoteWork/文档/img_7.png (+0 -0)
  68. NoteWork/文档/img_8.png (+0 -0)
  69. NoteWork/文档/img_9.png (+0 -0)
  70. NoteWork/文档/update.md (+0 -29)
  71. NoteWork/文档/开发文档.md (+0 -108)
  72. README.md (+1 -2)

+ 15 - 2
FworkSpider/feapder/VERSION

@@ -7,8 +7,9 @@ Created on 2020/4/21 10:41 PM
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-import os, sys
+import os
 import re
+import sys
 
 sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
 
@@ -16,6 +17,10 @@ __all__ = [
     "AirSpider",
     "Spider",
     "BatchSpider",
+    "BiddingListSpider",
+    "BiddingDetailSpider",
+    "PlanToBuildListSpider",
+    "PlanToBuildDetailSpider",
     "BaseParser",
     "BatchParser",
     "Request",
@@ -25,7 +30,15 @@ __all__ = [
     "ArgumentParser",
 ]
 
-from feapder.core.spiders import Spider, BatchSpider, AirSpider
+from feapder.core.spiders import (
+    Spider,
+    BatchSpider,
+    AirSpider,
+    BiddingListSpider,
+    BiddingDetailSpider,
+    PlanToBuildListSpider,
+    PlanToBuildDetailSpider,
+)
 from feapder.core.base_parser import BaseParser, BatchParser
 from feapder.network.request import Request
 from feapder.network.response import Response

+ 29 - 33
FworkSpider/feapder/buffer/__init__.py

@@ -43,7 +43,7 @@ class ItemBuffer(threading.Thread):
 
             self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
 
-            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
+            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
             self._table_failed_items = setting.TAB_FAILED_ITEMS.format(
                 redis_key=redis_key
             )
@@ -99,9 +99,9 @@ class ItemBuffer(threading.Thread):
 
         return self._mysql_pipeline
 
-    def run(self): # step 1 开始
+    def run(self):
         self._thread_stop = False
-        while not self._thread_stop: # 爬虫不停止,就一直循环刷新
+        while not self._thread_stop:
             self.flush()
             tools.delay_time(1)
 
@@ -111,18 +111,14 @@ class ItemBuffer(threading.Thread):
         self._thread_stop = True
         self._started.clear()
 
-    def put_item(self, item): # step 存储数据的入口 将需要存储的数据放入数据管道队列
+    def put_item(self, item):
         if isinstance(item, Item):
             # 入库前的回调
-
-            if item.item_name == "ListItem":  # 测试框架有用,对listitem不进行存储,正式框架没有这个判断
-                return
             item.pre_to_db()
-            # print(item)
-            if item.save: # 根据save字段,判断该条信息是否存储
-                self._items_queue.put(item)
-        else:
+
+        if getattr(item, "save", True):  # save=False 不推送入库
             self._items_queue.put(item)
+
     def flush(self):
         try:
             items = []
@@ -132,26 +128,26 @@ class ItemBuffer(threading.Thread):
             items_fingerprints = []
             data_count = 0
 
-            while not self._items_queue.empty(): # step 2 数据管道队列不为空时时 不等待直接取值
-                data = self._items_queue.get_nowait() # 队列的 不等待直接取值方法,类似get
+            while not self._items_queue.empty():
+                data = self._items_queue.get_nowait()
                 data_count += 1
 
                 # data 分类
                 if callable(data):
                     callbacks.append(data)
 
-                elif isinstance(data, UpdateItem): # 更新型数据,走更新管道,采集框架只存不更新,可以忽略不看
+                elif isinstance(data, UpdateItem):
                     update_items.append(data)
 
                 elif isinstance(data, Item):
                     items.append(data)
-                    if setting.ITEM_FILTER_ENABLE: # item去重,对于当前框架,无效,不看
+                    if setting.ITEM_FILTER_ENABLE:
                         items_fingerprints.append(data.fingerprint)
 
                 else:  # request-redis
                     requests.append(data)
 
-                if data_count >= UPLOAD_BATCH_MAX_SIZE: # step 3 需要存储的数据,达到一定数量后,统一存储
+                if data_count >= UPLOAD_BATCH_MAX_SIZE:
                     self.__add_item_to_db(
                         items, update_items, requests, callbacks, items_fingerprints
                     )
@@ -163,7 +159,7 @@ class ItemBuffer(threading.Thread):
                     items_fingerprints = []
                     data_count = 0
 
-            if data_count: # step 3 管道为空后,将剩余的数据,统一存储
+            if data_count:
                 self.__add_item_to_db(
                     items, update_items, requests, callbacks, items_fingerprints
                 )
@@ -248,11 +244,8 @@ class ItemBuffer(threading.Thread):
         return datas_dict
 
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        # step 3.1.1 打点 记录总条数及每个key情况
-        self.check_datas(table=table, datas=datas)
-
-        for pipeline in self._pipelines: # setting 配置的piplines方法
-            if is_update: # 更新方法 不看
+        for pipeline in self._pipelines:
+            if is_update:
                 if table == self._task_table and not isinstance(
                     pipeline, MysqlPipeline
                 ):
@@ -265,7 +258,7 @@ class ItemBuffer(threading.Thread):
                     return False
 
             else:
-                if not pipeline.save_items(table, datas): # step 3.1.2 调用pipline的 save_items 方法
+                if not pipeline.save_items(table, datas):
                     log.error(
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                     )
@@ -281,19 +274,22 @@ class ItemBuffer(threading.Thread):
                 )
                 return False
 
+        self.metric_datas(table=table, datas=datas)
         return True
 
+    def export_to_db(self, table, datas, **kwargs):
+        return self.__export_to_db(table, datas, **kwargs)
+
     def __add_item_to_db(
         self, items, update_items, requests, callbacks, items_fingerprints
     ):
         export_success = True
         self._is_adding_to_db = True
 
-        # 去重 item去重,不看
         if setting.ITEM_FILTER_ENABLE:
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
 
-        # step 分捡 将每个表之间的数据分开 拆分后 原items为空
+        # 分捡
         items_dict = self.__pick_items(items)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
 
@@ -311,7 +307,7 @@ class ItemBuffer(threading.Thread):
                 % (table, tools.dumps_json(datas, indent=16))
             )
 
-            if not self.__export_to_db(table, datas): # step 3.1 导出到数据库
+            if not self.__export_to_db(table, datas):
                 export_success = False
                 failed_items["add"].append({"table": table, "datas": datas})
 
@@ -336,7 +332,7 @@ class ItemBuffer(threading.Thread):
                 failed_items["update"].append({"table": table, "datas": datas})
 
         if export_success:
-            # step 3.2 保存成功后,执行的执行回调
+            # 执行回调
             while callbacks:
                 try:
                     callback = callbacks.pop(0)
@@ -344,17 +340,15 @@ class ItemBuffer(threading.Thread):
                 except Exception as e:
                     log.exception(e)
 
-            # step 删除做过的request
+            # 删除做过的request
             if requests:
                 self.redis_db.zrem(self._table_request, requests)
 
-            # 去重入库 不走这个去重
+            # 去重入库
             if setting.ITEM_FILTER_ENABLE:
                 if items_fingerprints:
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
         else:
-            # step 3.2 保存失败后,执行的执行回调
-
             failed_items["requests"] = requests
 
             if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
@@ -412,17 +406,19 @@ class ItemBuffer(threading.Thread):
 
         self._is_adding_to_db = False
 
-    def check_datas(self, table, datas):
+    def metric_datas(self, table, datas):
         """
         打点 记录总条数及每个key情况
         @param table: 表名
         @param datas: 数据 列表
         @return:
         """
-        metrics.emit_counter("total count", len(datas), classify=table)
+        total_count = 0
         for data in datas:
+            total_count += 1
             for k, v in data.items():
                 metrics.emit_counter(k, int(bool(v)), classify=table)
+        metrics.emit_counter("total count", total_count, classify=table)
 
     def close(self):
         # 调用pipeline的close方法
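
Note (not part of the diff): a minimal sketch of the reworked put_item contract above. pre_to_db() now runs for every Item, and anything whose save attribute is falsy stays out of the DB queue; the attribute name comes straight from the getattr(item, "save", True) check.

from feapder.network.item import Item

item = Item(title="demo")   # Item accepts field kwargs, as used elsewhere in this commit
item.save = False           # ItemBuffer.put_item() now drops items flagged save=False
# item_buffer.put_item(item)  # assuming an already-constructed ItemBuffer instance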

+ 5 - 5
FworkSpider/feapder/buffer/request_buffer.py

@@ -34,8 +34,8 @@ class RequestBuffer(threading.Thread):
             self._del_requests_deque = collections.deque()
             self._db = RedisDB()
 
-            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
-            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
+            self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
                 redis_key=redis_key
             )
 
@@ -44,9 +44,9 @@ class RequestBuffer(threading.Thread):
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                 )  # 默认过期时间为一个月
 
-    def run(self): # step 1 线程入口
+    def run(self):
         self._thread_stop = False
-        while not self._thread_stop: # 每隔一分钟进行一次 将产生的任务存储
+        while not self._thread_stop:
             try:
                 self.__add_request_to_db()
             except Exception as e:
@@ -94,7 +94,7 @@ class RequestBuffer(threading.Thread):
         callbacks = []
 
         while self._requests_deque:
-            request = self._requests_deque.popleft() # 从任务队列中从左取任务(先进先出)
+            request = self._requests_deque.popleft()
             self._is_adding_to_db = True
 
             if callable(request):

+ 14 - 8
FworkSpider/feapder/commands/create/create_spider.py

@@ -16,10 +16,10 @@ import feapder.utils.tools as tools
 from .create_init import CreateInit
 
 
-def deal_file_info(file):
+def deal_file_info(file, author):
     file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
+    # file = file.replace("{USER}", getpass.getuser())
+    file = file.replace("{USER}", author)
     return file
 
 
@@ -57,8 +57,14 @@ class CreateSpider:
             template_path = "batch_spider_template.tmpl"
         elif spider_type == 4:
             template_path = "spider_list_template.tmpl"
+        elif spider_type == 5:
+            template_path = "detail_template.tmpl"
+        elif spider_type == 6:
+            template_path = "njpc_list_template.tmpl"
+        elif spider_type == 7:
+            template_path = "njpc_detail_template.tmpl"
         else:
-            raise ValueError("spider type error, support 1 2 3")
+            raise ValueError("spider type error, support 1 2 3 4 5 6 7")
 
         template_path = os.path.abspath(
             os.path.join(__file__, "../../../templates", template_path)
@@ -68,9 +74,9 @@ class CreateSpider:
 
         return spider_template
 
-    def create_spider(self, spider_template, spider_name):
+    def create_spider(self, spider_template, spider_name, author):
         spider_template = spider_template.replace("${spider_name}", spider_name)
-        spider_template = deal_file_info(spider_template)
+        spider_template = deal_file_info(spider_template, author)
         return spider_template
 
     def save_spider_to_file(self, spider, spider_name):
@@ -89,7 +95,7 @@ class CreateSpider:
 
         self._create_init.create()
 
-    def create(self, spider_name, spider_type):
+    def create(self, spider_name, spider_type, author):
         # 检查spider_name
         if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
             raise Exception("爬虫名不符合命名规范,请用下划线命名或驼峰命名方式")
@@ -97,5 +103,5 @@ class CreateSpider:
         if spider_name.islower():
             spider_name = tools.key2hump(spider_name)
         spider_template = self.get_spider_template(spider_type)
-        spider = self.create_spider(spider_template, spider_name)
+        spider = self.create_spider(spider_template, spider_name, author)
         self.save_spider_to_file(spider, spider_name)
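
For reference, a hedged sketch (not in the commit) of invoking the extended create() signature; the spider name and author string are placeholders, and the module path assumes feapder's standard layout.

from feapder.commands.create.create_spider import CreateSpider

# spider_type: 1-3 as before, 4 = spider_list, 5 = detail, 6 = njpc_list, 7 = njpc_detail
CreateSpider().create("example_detail_spider", spider_type=5, author="dongzhaorui")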

+ 5 - 5
FworkSpider/feapder/commands/shell.py

@@ -58,13 +58,13 @@ def fetch_curl(curl_args):
 
 def usage():
     """
-下载调试器
+    下载调试器
 
-usage: feapder shell [options] [args]
+    usage: feapder shell [options] [args]
 
-optional arguments:
-  -u, --url     抓取指定url
-  -c, --curl    抓取curl格式的请求
+    optional arguments:
+      -u, --url     抓取指定url
+      -c, --curl    抓取curl格式的请求
 
     """
     print(usage.__doc__)

+ 10 - 64
FworkSpider/feapder/core/__init__.py

@@ -8,15 +8,11 @@ Created on 2018-07-25 11:41:57
 @email:  boris_liu@foxmail.com
 """
 import os
-import traceback
 
-import feapder
 import feapder.utils.tools as tools
 from feapder.db.mysqldb import MysqlDB
 from feapder.network.item import UpdateItem
 from feapder.utils.log import log
-from feapder.utils.aliyun import UploadOSS
-from feapder.db.redisdb import RedisDB
 
 
 class BaseParser(object):
@@ -30,6 +26,16 @@ class BaseParser(object):
 
         pass
 
+        """
+        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
+        ---------
+        @param request:
+        ---------
+        @result: return request / request, response
+        """
+
+        pass
+
     def download_midware(self, request):
         """
         @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
@@ -91,66 +97,6 @@ class BaseParser(object):
         """
 
         pass
-    def infinite_crawl(self,request,response):
-        menu = request.item
-        list_item = request.list_item
-        if self.platform_next_page:  # real_page为连续翻页采集为0
-            if getattr(request, 'real_page', None) is not None:
-                request.real_page = 0
-
-            request.real_page += 1
-            if list_item.rel_count > 0:
-                request.real_page = 0
-
-            if request.real_page <= 5 and request.page < self.platform_max_page:
-                request.page += 1
-                request.callback = self.parse
-                if getattr(request, 'new_callback', None) is not None:
-                    request.callback = eval(request.new_callback)
-                    yield request
-        else:
-            if request.page < menu.get("crawl_page"):
-                request.page += 1
-                request.callback = self.parse
-                if getattr(request, 'new_callback', None) is not None:
-                    request.callback = eval(request.new_callback)
-                    yield request
-
-    def push_files(self, request, response):
-        """
-        @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
-        ---------
-        @param request:  request.url 为文件下载地址, 该方法需要自行调用
-        request.INFO  为上传文件时所需要提供的部分参数  必传
-         info = {
-            "org_url": "http://www...",  # 文件下载连接
-            "filename": f"{list_item.title}.docx",  # 文件名
-            "channel": list_item.channel,
-            "ftype": 'docx,zip,ftp', # 文件类型
-        }
-        request.headers 则存放请求的必要参数,如:parmas,headers  必传
-        ---------
-        @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
-        """
-        list_item = request.item
-        res = None
-        for i in range(5):
-            try:
-                parameter = request.parameter
-                res = UploadOSS().get_state(request.info,**parameter)
-            except:
-                log.error(traceback.format_exc())
-            if res is not None:
-                list_item.projectinfo = res
-                yield list_item
-                log.info(f"{res.get('filename')}附件下载完成,大小为:{res.get('size')},fid为:{res.get('fid')}")
-                return
-            else:
-                log.error(f"{res.get('filename')}附件下载失败,失败连接为:{res.get('org_url')}")
-        if res is None:
-            _db = RedisDB()
-            request_dict = request.to_dict
-            _db.zadd("forwork:files_failed", request_dict)
 
     def start_callback(self):
         """

+ 42 - 25
FworkSpider/feapder/core/collector.py

@@ -7,10 +7,9 @@ Created on 2016-12-23 11:24
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-
-import collections
 import threading
 import time
+from queue import Queue, Empty
 
 import feapder.setting as setting
 import feapder.utils.tools as tools
@@ -20,6 +19,7 @@ from feapder.utils.log import log
 
 
 class Collector(threading.Thread):
+
     def __init__(self, redis_key):
         """
         @summary:
@@ -34,9 +34,9 @@ class Collector(threading.Thread):
 
         self._thread_stop = False
 
-        self._todo_requests = collections.deque()
+        self._todo_requests = Queue(maxsize=setting.COLLECTOR_TASK_COUNT)
 
-        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
+        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
         self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
 
         self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"
@@ -52,7 +52,7 @@ class Collector(threading.Thread):
         self._thread_stop = False
         while not self._thread_stop:
             try:
-                self.__report_node_heartbeat() # step 汇报节点心跳
+                self.__report_node_heartbeat()  # step 汇报节点心跳
                 self.__input_data()
             except Exception as e:
                 log.exception(e)
@@ -66,25 +66,29 @@ class Collector(threading.Thread):
         self._started.clear()
 
     def __input_data(self):
-        current_timestamp = tools.get_current_timestamp()
-        if len(self._todo_requests) >= self._request_count: # step 待执行任务数量>设置的任务数量上限 不处理
+        if self._request_count / setting.SPIDER_THREAD_COUNT > 1 and (
+            self._todo_requests.qsize() > setting.SPIDER_THREAD_COUNT
+            or self._todo_requests.qsize() >= self._todo_requests.maxsize
+        ):  # 当任务总数大于线程数 且 内存队列持有任务总数大于线程数 此时不添加任务
+            time.sleep(0.1)
             return
 
+        current_timestamp = tools.get_current_timestamp()
+
         request_count = self._request_count  # 先赋值
-        # step 查询最近有心跳的节点数量
+        # 查询最近有心跳的节点数量
         spider_count = self._db.zget_count(
             self._tab_spider_status,
             priority_min=current_timestamp - (self._interval + 10),
             priority_max=current_timestamp,
         )
-        # step 根据等待节点数量,动态分配request
+        # 根据等待节点数量,动态分配request
         if spider_count:
             # 任务数量
             task_count = self._db.zget_count(self._tab_requests)
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             request_count = task_count // spider_count + 1
 
-        # step 判断 request_count 数量是否大于 设置的上限 ,大于上限,重置
         request_count = (
             request_count
             if request_count <= self._request_count
@@ -108,7 +112,7 @@ class Collector(threading.Thread):
             if lose_count:
                 log.info("重置丢失任务完毕,共{}条".format(len(datas)))
 
-        # 取任务,只取当前时间搓以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
+        # 取任务,只取当前时间搓以内的任务,同时将取走的任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
         requests_list = self._db.zrangebyscore_set_score(
             self._tab_requests,
             priority_min="-inf",
@@ -117,10 +121,14 @@ class Collector(threading.Thread):
             count=request_count,
         )
 
+        log.debug("领取新任务完毕,共{}条".format(len(requests_list)))
+
         if requests_list:
             self._is_collector_task = True
             # 存request
             self.__put_requests(requests_list)
+        else:
+            time.sleep(0.1)
 
     def __report_node_heartbeat(self):
         """
@@ -150,28 +158,37 @@ class Collector(threading.Thread):
             except Exception as e:
                 log.exception(
                     """
-                error %s
-                request %s
-                """
+                    error %s
+                    request %s
+                    """
                     % (e, request)
                 )
-
                 request_dict = None
 
             if request_dict:
-                self._todo_requests.append(request_dict)
-
-    def get_requests(self, count):
-        requests = []
-        count = count if count <= len(self._todo_requests) else len(self._todo_requests)
-        while count:
-            requests.append(self._todo_requests.popleft())
-            count -= 1
+                self._todo_requests.put(request_dict)
 
-        return requests
+    def get_request(self):
+        try:
+            request = self._todo_requests.get(timeout=1)
+            return request
+        except Empty as e:
+            return None
 
     def get_requests_count(self):
-        return len(self._todo_requests) or self._db.zget_count(self._tab_requests) or 0
+        return (
+            self._todo_requests.qsize() or self._db.zget_count(self._tab_requests) or 0
+        )
 
     def is_collector_task(self):
         return self._is_collector_task
+
+    def get_spider_count(self):
+        return self._db.zget_count(
+            self._tab_spider_status,
+            priority_min=tools.get_current_timestamp() - (self._interval + 10),
+            priority_max=tools.get_current_timestamp(),
+        )
+
+    def delete_spider_node(self):
+        self._db.zrem(self._tab_spider_status, self._spider_mark)
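
A brief consumer-side sketch (not in the commit) of the reworked API: the deque batch call get_requests(count) is gone, and callers now pull tasks one at a time from the bounded queue.

# assuming `collector` is a started Collector instance
request_dict = collector.get_request()  # waits up to 1s, returns None when no task is available
if request_dict is None:
    pass  # idle round; try again later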

+ 1 - 1
FworkSpider/feapder/core/handle_failed_requests.py

@@ -24,7 +24,7 @@ class HandleFailedRequests(object):
         self._redisdb = RedisDB()
         self._request_buffer = RequestBuffer(self._redis_key)
 
-        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+        self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
             redis_key=redis_key
         )
 

File diff suppressed because it is too large
+ 499 - 453
FworkSpider/feapder/core/parser_control.py


+ 109 - 97
FworkSpider/feapder/core/scheduler.py

@@ -13,29 +13,34 @@ import threading
 import time
 from collections import Iterable
 
-
 import feapder.setting as setting
 import feapder.utils.tools as tools
 from feapder.buffer.item_buffer import ItemBuffer
 from feapder.buffer.request_buffer import RequestBuffer
 from feapder.core.base_parser import BaseParser
 from feapder.core.collector import Collector
+from feapder.core.handle_failed_items import HandleFailedItems
 from feapder.core.handle_failed_requests import HandleFailedRequests
 from feapder.core.parser_control import PaserControl
 from feapder.db.redisdb import RedisDB
 from feapder.network.item import Item
 from feapder.network.request import Request
+from feapder.utils import metrics
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
-from feapder.utils import metrics
 
-SPIDER_START_TIME_KEY = "spider_start_time"
+SPIDER_UUID = tools.get_uuid()
+SPIDER_START_TIME = "spider_start_time"
+SPIDER_START_TIME_KEY = SPIDER_START_TIME + "#" + SPIDER_UUID
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 
+
 class Obj(object):
     def __init__(self, dict_):
         self.__dict__.update(dict_)
+
+
 class Scheduler(threading.Thread):
     __custom_setting__ = {}
 
@@ -78,7 +83,15 @@ class Scheduler(threading.Thread):
             else:
                 setattr(setting, key, value)
         
-
+        # 历史爬虫[redis_key]
+        for item in sys.argv[1:]:
+            if item.startswith("--purpose"):
+                val = item.split('=')[-1]
+                if not redis_key.endswith(val):
+                    # 历史爬虫需要单独的redis_key,防止增量爬虫
+                    # 与历史爬虫共用同一个redis_key,出现增量爬虫断点续采的情况
+                    redis_key += f'_{val}'
+                    
         self._redis_key = redis_key or setting.REDIS_KEY
         if not self._redis_key:
             raise Exception(
@@ -129,11 +142,12 @@ class Scheduler(threading.Thread):
 
         self._spider_name = redis_key
         self._project_name = redis_key.split(":")[0]
+        self._task_table = task_table
 
         self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
         self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
-        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
-        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
+        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
+        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
             redis_key=redis_key
         )
 
@@ -171,23 +185,16 @@ class Scheduler(threading.Thread):
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
 
     def run(self):  # STEP 1 爬虫框架入口
-        if not self.is_reach_next_spider_time(): # STEP 2 检测爬虫是否到达执行时间
+        if not self.is_reach_next_spider_time():  # STEP 2 检测爬虫是否到达执行时间
             return
 
-        self._start() # STEP 3 开始运行爬虫
+        self._start()  # STEP 3 开始运行爬虫
 
-        while True: # step 4 对爬虫状态的一个监控
+        while True:  # step 4 对爬虫状态的一个监控
             try:
                 if self.all_thread_is_done(): # Step 5 判断爬虫是否运行完成
                     if not self._is_notify_end:
                         self.spider_end()  # 跑完一轮
-                        self.record_spider_state(  # step 6 应该是一个通知爬虫结束的方法
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
                         self._is_notify_end = True
 
                     if not self._keep_alive: # step 7 如果不是常驻爬虫 停止所有线程
@@ -197,7 +204,7 @@ class Scheduler(threading.Thread):
                 else:
                     self._is_notify_end = False
 
-                self.check_task_status() # step 8 检查任务状态,并进行告警通知
+                self.check_task_status()  # step 8 检查任务状态,并进行告警通知
 
             except Exception as e:
                 log.exception(e)
@@ -207,15 +214,8 @@ class Scheduler(threading.Thread):
     def __add_task(self):
         # 启动parser 的 start_requests
         self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
-        self.record_spider_state(
-            spider_type=1,
-            state=0,
-            batch_date=tools.get_current_date(),
-            spider_start_time=tools.get_current_date(),
-            batch_interval=self._batch_interval,
-        )
 
-        # 判断任务池中属否还有任务,若有接着抓取
+        # 判断任务池中属否还有任务,若有接着抓取,若无则生产新任务
         todo_task_count = self._collector.get_requests_count()
         if todo_task_count:
             log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
@@ -227,17 +227,17 @@ class Scheduler(threading.Thread):
                     raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
 
                 result_type = 1
-                for result in results or []: # step 对yield 的数据进行判断处理
-                    if isinstance(result, Request): # Request 加入到任务队列
+                for result in results or []:  # step 对yield 的数据进行判断处理
+                    if isinstance(result, Request):  # Request 加入到任务队列
                         result.parser_name = result.parser_name or parser.name
                         self._request_buffer.put_request(result)
                         result_type = 1
 
-                    elif isinstance(result, Item): # Item 数据,存入到数据管道队列,等待存储
+                    elif isinstance(result, Item):  # Item 数据,存入到数据管道队列,等待存储
                         self._item_buffer.put_item(result)
                         result_type = 2
 
-                    elif callable(result):  # callbale的request可能是更新数据库操作的函数
+                    elif callable(result):  # callable  request 可能是更新数据库操作的函数
                         if result_type == 1:
                             self._request_buffer.put_request(result)
                         else:
@@ -253,12 +253,21 @@ class Scheduler(threading.Thread):
                 self._item_buffer.flush()
 
     def _start(self):
+        # 将失败的item入库
+        if setting.RETRY_FAILED_ITEMS:
+            handle_failed_items = HandleFailedItems(
+                redis_key=self._redis_key,
+                task_table=self._task_table,
+                item_buffer=self._item_buffer,
+            )
+            handle_failed_items.reput_failed_items_to_db()
 
-        self._request_buffer.start()  # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
-
-        self._item_buffer.start()  # STEP 3.2 启动item_buffer -- 管道管理器 责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
-
-        self._collector.start()  # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
+        # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
+        self._request_buffer.start()
+        # STEP 3.2 启动item_buffer -- 管道管理器 责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
+        self._item_buffer.start()
+        # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
+        self._collector.start()
 
         # 启动parser control
         for i in range(self._thread_count):
@@ -293,7 +302,8 @@ class Scheduler(threading.Thread):
                 self.__add_task()
 
     def all_thread_is_done(self):
-        for i in range(3):  # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+        # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+        for i in range(3):
             # STEP 5.1 检测 collector 状态
             if (
                 self._collector.is_collector_task()
@@ -320,7 +330,7 @@ class Scheduler(threading.Thread):
             ):
                 return False
 
-            tools.delay_time(1) # 休眠一分钟
+            tools.delay_time(1)  # 休眠1秒
 
         return True
 
@@ -336,6 +346,40 @@ class Scheduler(threading.Thread):
         else:
             return
 
+        # 检查失败任务数量 超过1000 报警,
+        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
+        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<< 失败次数:', failed_count)
+        if failed_count > setting.WARNING_FAILED_COUNT:
+            # 发送报警
+            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
+            log.error(msg)
+            self.send_msg(
+                msg,
+                level="error",
+                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
+            )
+
+        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
+        failed_task_count, success_task_count = PaserControl.get_task_status_count()
+        total_count = success_task_count + failed_task_count
+        if total_count > 0:
+            task_success_rate = success_task_count / total_count
+            if task_success_rate < 0.5:
+                # 发送报警
+                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
+                    self._spider_name,
+                    success_task_count,
+                    failed_task_count,
+                    task_success_rate,
+                )
+                log.error(msg)
+                self.send_msg(
+                    msg,
+                    level="error",
+                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
+                )
+
+        # 判断任务数是否变化
         # step 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
         task_count = self._redisdb.zget_count(self._tab_requests)
 
@@ -346,7 +390,7 @@ class Scheduler(threading.Thread):
                     self._tab_spider_time,
                     SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                     tools.get_current_timestamp(),
-                )  # 多进程会重复发消息, 使用reids记录上次统计时间
+                )  # 多进程会重复发消息, 使用redis记录上次统计时间
             else:
                 # step 判断时间间隔是否超过20分钟
                 lua = """
@@ -357,7 +401,8 @@ class Scheduler(threading.Thread):
                     -- 取值
                     local last_timestamp = redis.call('hget', KEYS[1], field)
                     if last_timestamp and current_timestamp - last_timestamp >= 1200 then
-                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
+                        -- 返回任务停滞时间 秒
+                        return current_timestamp - last_timestamp 
                     end
 
                     if not last_timestamp then
@@ -382,49 +427,15 @@ class Scheduler(threading.Thread):
                     msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                         self._spider_name, tools.format_seconds(overtime)
                     )
-                    log.error(msg)  # TODO 这一步可以加一个print,在平台的日志框里输出
+                    log.error(msg) # TODO 这一步可以加一个print,在平台的日志框里输出
                     self.send_msg(
                         msg,
                         level="error",
                         message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
                     )
-
         else:
             self._last_task_count = 0
 
-        # 检查失败任务数量 超过1000 报警,
-        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
-        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<失败次数:',failed_count)
-        if failed_count > setting.WARNING_FAILED_COUNT:
-            # 发送报警
-            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
-            log.error(msg)
-            self.send_msg(
-                msg,
-                level="error",
-                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
-            )
-
-        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
-        failed_task_count, success_task_count = PaserControl.get_task_status_count()
-        total_count = success_task_count + failed_task_count
-        if total_count > 0:
-            task_success_rate = success_task_count / total_count
-            if task_success_rate < 0.5:
-                # 发送报警
-                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
-                    self._spider_name,
-                    success_task_count,
-                    failed_task_count,
-                    task_success_rate,
-                )
-                log.error(msg)
-                self.send_msg(
-                    msg,
-                    level="error",
-                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
-                )
-
         # 检查入库失败次数
         if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
             msg = "《{}》爬虫导出数据失败,失败次数:{}, 请检查爬虫是否正常".format(
@@ -450,6 +461,11 @@ class Scheduler(threading.Thread):
                 if table != self._tab_spider_time:
                     log.info("正在删除key %s" % table)
                     redis.clear(table)
+                else:
+                    keys = redis.hgetall(table)
+                    for key in keys:
+                        if key.startswith(SPIDER_START_TIME):
+                            redis.hdel(table, key)
 
     def _stop_all_thread(self):
         self._request_buffer.stop()
@@ -472,9 +488,12 @@ class Scheduler(threading.Thread):
     def get_argvs(self):
         argvs = {"next_page": False, "max_page": 10}
         for item in sys.argv[1:]:
-            print(item)
+            # print(item)
             if item.startswith("--"):
-                argvs[item.replace("--", "").split('=')[0]] = eval(item.split('=')[-1]) # 此处使用eval的原因是字符串转bool或int
+                key = item.replace("--", "").split('=')[0]
+                val = item.split('=')[-1]
+                if key != 'purpose':
+                    argvs[key] = eval(val)  # 此处使用eval的原因是字符串转bool或int
         return json.loads(json.dumps(argvs), object_hook=Obj)
 
     def spider_begin(self):
@@ -489,8 +508,9 @@ class Scheduler(threading.Thread):
             self._begin_callback()
 
         for parser in self._parsers:
-            parser.platform_next_page = self.get_argvs().next_page
-            parser.platform_max_page = self.get_argvs().max_page
+            parameter = self.get_argvs()
+            parser.platform_next_page = parameter.next_page
+            parser.platform_max_page = parameter.max_page
             parser.start_callback()
 
         # 记录开始时间
@@ -503,7 +523,7 @@ class Scheduler(threading.Thread):
             # 发送消息
             # self.send_msg("《%s》爬虫开始" % self._spider_name)
 
-    def spider_end(self): # step end 爬虫结束时的一些操作
+    def spider_end(self):  # step end 爬虫结束时的一些操作
         self.record_end_time()
 
         if self._end_callback:  # 系统自带的回调,如果自定义回调,则这个回调不会执行
@@ -511,8 +531,8 @@ class Scheduler(threading.Thread):
 
         for parser in self._parsers:
             if not self._keep_alive:
-                parser.close() # 爬虫可自定义close
-            parser.end_callback() # 调用结束回调函数,可在爬虫自定义
+                parser.close()  # 爬虫可自定义close
+            parser.end_callback()  # 调用结束回调函数,可在爬虫自定义
 
         if not self._keep_alive:
             # 关闭webdirver
@@ -530,21 +550,24 @@ class Scheduler(threading.Thread):
         )
         if data:
             begin_timestamp = int(data)
-
-            spand_time = tools.get_current_timestamp() - begin_timestamp
-
+            elapsed_time = tools.get_current_timestamp() - begin_timestamp
             msg = "《%s》爬虫结束,耗时 %s" % (
                 self._spider_name,
-                tools.format_seconds(spand_time),
+                tools.format_seconds(elapsed_time),
             )
             log.info(msg)
 
             # self.send_msg(msg)
 
         if self._keep_alive:
-            log.info("爬虫不自动结束, 等待下一轮任务...")
+            log.info("爬虫不自动结束,等待下一轮任务...")
         else:
-            self.delete_tables(self._tab_spider_status)
+            if self._collector.get_spider_count() <= 1:
+                self.delete_tables(self._tab_spider_time)
+                self.delete_tables(self._tab_spider_status)
+            else:
+                # 清除关闭爬虫的心跳记录,防止删除任务共享表,造成爬虫异常僵死
+                self._collector.delete_spider_node()
 
     def record_end_time(self):
         # 记录结束时间
@@ -578,17 +601,6 @@ class Scheduler(threading.Thread):
 
         return True
 
-    def record_spider_state(
-        self,
-        spider_type,
-        state,
-        batch_date=None,
-        spider_start_time=None,
-        spider_end_time=None,
-        batch_interval=None,
-    ):
-        pass
-
     def join(self, timeout=None):
         """
         重写线程的join
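
For context, an illustrative command line (not in the commit; the script name and values are placeholders) matching the new argv handling above:

# python example_spider.py --next_page=True --max_page=20 --purpose=history
#   --purpose=history  -> the Scheduler appends "_history" to redis_key, keeping the
#                         history run's task queue separate from the incremental run
#   other --key=value  -> parsed by get_argvs() via eval(), here next_page=True, max_page=20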

+ 16 - 2
FworkSpider/feapder/core/spiders/__init__.py

@@ -8,8 +8,22 @@ Created on 2020/4/22 12:08 AM
 @email: boris_liu@foxmail.com
 """
 
-__all__ = ["AirSpider", "Spider", "BatchSpider"]
+__all__ = [
+    "AirSpider",
+    "Spider",
+    "BatchSpider",
+    "BiddingListSpider",
+    "BiddingDetailSpider",
+    "PlanToBuildListSpider",
+    "PlanToBuildDetailSpider",
+]
 
 from feapder.core.spiders.air_spider import AirSpider
-from feapder.core.spiders.spider import Spider
 from feapder.core.spiders.batch_spider import BatchSpider
+from feapder.core.spiders.spider import (
+    Spider,
+    BiddingListSpider,
+    BiddingDetailSpider,
+    PlanToBuildListSpider,
+    PlanToBuildDetailSpider
+)

+ 3 - 18
FworkSpider/feapder/core/spiders/air_spider.py

@@ -126,11 +126,11 @@ class BatchSpider(BatchParser, Scheduler):
         self._check_task_interval = check_task_interval
         self._task_limit = task_limit  # mysql中一次取的任务数量
         self._related_task_tables = [
-            setting.TAB_REQUSETS.format(redis_key=redis_key)
+            setting.TAB_REQUESTS.format(redis_key=redis_key)
         ]  # 自己的task表也需要检查是否有任务
         if related_redis_key:
             self._related_task_tables.append(
-                setting.TAB_REQUSETS.format(redis_key=related_redis_key)
+                setting.TAB_REQUESTS.format(redis_key=related_redis_key)
             )
 
         self._related_batch_record = related_batch_record
@@ -216,7 +216,7 @@ class BatchSpider(BatchParser, Scheduler):
                 is_first_check = False
 
                 # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
-                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
                 todo_task_count = self._redisdb.zget_count(tab_requests)
 
                 tasks = []
@@ -922,13 +922,6 @@ class BatchSpider(BatchParser, Scheduler):
 
             # 爬虫开始
             self.spider_begin()
-            self.record_spider_state(
-                spider_type=2,
-                state=0,
-                batch_date=batch_date,
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
         else:
             log.error("插入新批次失败")
 
@@ -1028,14 +1021,6 @@ class BatchSpider(BatchParser, Scheduler):
                     ):  # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况)
                         if not self._is_notify_end:
                             self.spider_end()
-                            self.record_spider_state(
-                                spider_type=2,
-                                state=1,
-                                batch_date=self._batch_date_cache,
-                                spider_end_time=tools.get_current_date(),
-                                batch_interval=self._batch_interval,
-                            )
-
                             self._is_notify_end = True
 
                         if not self._keep_alive:

+ 149 - 16
FworkSpider/feapder/core/spiders/spider.py

@@ -16,6 +16,7 @@ import feapder.setting as setting
 import feapder.utils.tools as tools
 from feapder.core.base_parser import BaseParser
 from feapder.core.scheduler import Scheduler
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
 from feapder.network.item import Item
 from feapder.network.request import Request
@@ -96,7 +97,7 @@ class Spider(
         while True:
             try:
                 # 检查redis中是否有任务
-                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
                 todo_task_count = redisdb.zget_count(tab_requests)
 
                 if todo_task_count < self._min_task_count:  # 添加任务
@@ -160,14 +161,6 @@ class Spider(
         if self._is_distributed_task:  # 有任务时才提示启动爬虫
             # begin
             self.spider_begin()
-            self.record_spider_state(
-                spider_type=1,
-                state=0,
-                batch_date=tools.get_current_date(),
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
-
             # 重置已经提示无任务状态为False
             self._is_show_not_task = False
 
@@ -194,13 +187,6 @@ class Spider(
                 if self.all_thread_is_done():
                     if not self._is_notify_end:
                         self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
                         self._is_notify_end = True
 
                     if not self._keep_alive:
@@ -435,3 +421,150 @@ class DebugSpider(Spider):
             tools.delay_time(1)  # 1秒钟检查一次爬虫状态
 
         self.delete_tables([self._redis_key + "*"])
+
+
+class BusinessBaseListSpider(Spider):
+    """列表页爬虫事务基类"""
+
+    __business_type__ = "List"
+
+    def _increment_page_number(self, request):
+        """无限翻页 - 页码自增"""
+        if self.platform_next_page:
+            if getattr(request, 'real_page', None) is None:
+                request.real_page = 0  # real_page=连续翻页页码(真实入库数量=0)
+
+            request.real_page += 1
+
+            if request.rel_count > 0:
+                request.real_page = 0  # 当真实入库数量大于0,重置翻页记录
+                request.rel_count = 0  # 重置实际入库数量
+
+            if request.real_page <= 5 and request.page < self.platform_max_page:
+                request.page += 1
+                # 设置无限翻页回调方法,进行列表页解析处理
+                callback_parser = (
+                    request.callback
+                    if callable(request.callback)
+                    else self.parse
+                )
+                request.callback = callback_parser
+                yield request
+        else:
+            if request.page < int(request.item["crawl_page"]):
+                request.page += 1  # 采集页码自增
+                request.rel_count = 0  # 重置实际入库数量
+                # 设置无限翻页回调方法,进行列表页解析处理
+                callback_parser = (
+                    request.callback
+                    if callable(request.callback)
+                    else self.parse
+                )
+                request.callback = callback_parser
+                yield request
+
+    def infinite_pages(self, request, response):
+        """无限翻页"""
+        request_generator = self._increment_page_number(request)
+        try:
+            request = next(request_generator)
+            return request
+        except StopIteration:
+            pass
+
+
+class BusinessBaseDetailSpider(Spider):
+    """详情页爬虫事务基类"""
+
+    __business_type__ = "Detail"
+
+    __custom_setting__ = dict(
+        ITEM_FILTER_ENABLE=False
+    )
+
+    err_coll_name = "listdata_err"
+    _to_db = None
+
+    def get_tasks(self, query, limit=10, is_delete=True, **kwargs):
+        """
+        领取采集任务
+
+        :param dict query: 查询条件
+        :param limit: 结果数量
+        :param is_delete: 取走的任务是否删除
+        :param kwargs
+            更多参数 https://docs.mongodb.com/manual/reference/command/find/#command-fields
+
+        :return: dict
+        """
+        if "sort" not in kwargs:
+            kwargs.setdefault("sort", {"_id": -1})
+
+        cursor = self.to_db.find(coll_name=self.db_name, condition=query, limit=limit, **kwargs)
+        for task in cursor:
+            yield task
+
+            if is_delete:
+                self.to_db.delete(coll_name=self.db_name, condition=task)
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+
+class BiddingListSpider(BusinessBaseListSpider):
+    """标讯列表页爬虫事务类"""
+
+    __business_type__ = "BiddingList"
+
+    pass
+
+
+class BiddingDetailSpider(BusinessBaseDetailSpider):
+    """标讯详情页爬虫事务类"""
+
+    __business_type__ = "BiddingDetail"
+    db_name = "mgp_list"
+
+    def failed_request(self, request, response):
+        """请求、解析错误次数超过上限后,将原信息重新保存至数据库,并修改failed字段"""
+        _data = request.base_info if isinstance(request.base_info, dict) else request.base_info.to_dict
+        item = Item(origin_data=_data)
+        item.table_name = self.err_coll_name
+        item.status_code = getattr(response, "status_code", -1)
+        item.err_reason = getattr(request, "error_msg", "")
+        item.err_requests = int(getattr(item, "err_requests", 0)) + 1
+        item.create_at = tools.ensure_int64(tools.get_current_timestamp())
+        item.origin = self.db_name
+        item.spidercode = _data["spidercode"]
+        yield item
+
+
+class PlanToBuildListSpider(BusinessBaseListSpider):
+    """拟建列表页爬虫事务类"""
+
+    __business_type__ = "PlanToBuildList"
+
+    pass
+
+
+class PlanToBuildDetailSpider(BusinessBaseDetailSpider):
+    """拟建详情页爬虫事务类"""
+
+    __business_type__ = "PlanToBuildDetail"
+    db_name = "njpc_list"
+
+    def failed_request(self, request, response):
+        """请求、解析错误次数超过上限后,将原信息重新保存至数据库,并修改failed字段"""
+        _data = request.item if isinstance(request.item, dict) else request.item.to_dict
+        item = Item(origin_data=_data)
+        item.table_name = self.err_coll_name
+        item.status_code = getattr(response, "status_code", -1)
+        item.err_requests = int(getattr(item, "err_requests", 0)) + 1
+        item.err_reason = getattr(request, "error_msg", "")
+        item.create_at = tools.ensure_int64(tools.get_current_timestamp())
+        item.origin = self.db_name
+        item.spidercode = _data["spidercode"]
+        yield item
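
A hedged usage sketch (not in the commit) of the new business base classes. The menu fields, start URL and query condition are placeholders, and feapder.Request is assumed to store extra keyword arguments (item, page, rel_count) as request attributes.

import feapder


class DemoBiddingList(feapder.BiddingListSpider):

    def start_requests(self):
        menu = {"channel": "demo", "spidercode": "a_demo_code", "crawl_page": 3}
        yield feapder.Request("https://example.com/list", item=menu, page=1, rel_count=0)

    def parse(self, request, response):
        # ... extract rows here; add 1 to request.rel_count for every row actually saved ...
        next_request = self.infinite_pages(request, response)
        if next_request is not None:  # infinite_pages returns None once paging stops
            yield next_request


class DemoBiddingDetail(feapder.BiddingDetailSpider):

    def start_requests(self):
        # pulls tasks from the mgp_list collection; the condition and parse_url fields are hypothetical
        for task in self.get_tasks({"parser_name": "demo_detail"}, limit=20):
            yield feapder.Request(task["parse_url"], item=task)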

+ 1 - 5
FworkSpider/feapder/db/__init__.py

@@ -159,7 +159,6 @@ class MongoDB:
         try:
             collection.insert_one(data)
         except DuplicateKeyError as e:
-            data.pop("_id", "")
             # 存在则更新
             if update_columns:
                 if not isinstance(update_columns, (tuple, list)):
@@ -236,7 +235,6 @@ class MongoDB:
                     # 数据重复
                     # 获取重复的数据
                     data = error.get("op")
-                    data.pop("_id", "")
 
                     def get_condition():
                         # 获取更新条件
@@ -265,9 +263,7 @@ class MongoDB:
                             }
                         else:
                             # 使用数据本身的值更新
-                            doc = {}
-                            for key in update_columns:
-                                doc = {key: data.get(key)}
+                            doc = {key: data.get(key) for key in update_columns}
 
                         collection.update_one(get_condition(), {"$set": doc})
                         add_count -= 1

+ 107 - 76
FworkSpider/feapder/db/mysqldb.py

@@ -2,48 +2,89 @@
 """
 Created on 2018-12-13 21:08
 ---------
-@summary:  sha256 redis集群去重,正式环境使用的去重方式
+@summary:
 ---------
 @author: Boris
 @email: boris_liu@foxmail.com
 """
 
 import copy
-from typing import Any, List, Union, Tuple, Callable
-import rediscluster
-from Crypto.Hash import SHA256
-from feapder import setting
+from typing import Any, List, Union, Tuple, Callable, Optional
+
+from feapder.utils.tools import get_md5
+from .bloomfilter import BloomFilter, ScalableBloomFilter
+from .expirefilter import ExpireFilter
+from .litefilter import LiteFilter
+from .swordfishfilter import SwordFishFilter
+
 
 class Dedup:
     BloomFilter = 1
     MemoryFilter = 2
     ExpireFilter = 3
-    def __init__(self,ilter_type: int = BloomFilter):
-        self._to_sha256 = True
-        self._to_redis = None
+    LiteFilter = 4
+    SwordFishFilter = 5
+
+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
+        if filter_type == Dedup.ExpireFilter:
+            try:
+                expire_time = kwargs["expire_time"]
+            except:
+                raise ValueError("需传参数 expire_time")
+
+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
+                "name", expire_time
+            )
+            expire_time_record_key = "dedup:expire_set:expire_time"
+
+            self.dedup = ExpireFilter(
+                name=name,
+                expire_time=expire_time,
+                expire_time_record_key=expire_time_record_key,
+                redis_url=kwargs.get("redis_url"),
+            )
+        elif filter_type == Dedup.SwordFishFilter:
+            self.dedup = SwordFishFilter(
+                redis_url=kwargs.get("redis_url"),
+                expire_time=kwargs.get("expire_time")
+            )
+        else:
+            initial_capacity = kwargs.get("initial_capacity", 100000000)
+            error_rate = kwargs.get("error_rate", 0.00001)
+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get("name", "bloomfilter")
+            if filter_type == Dedup.BloomFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
+                    redis_url=kwargs.get("redis_url"),
+                )
+            elif filter_type == Dedup.MemoryFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
+                )
+            elif filter_type == Dedup.LiteFilter:
+                self.dedup = LiteFilter()
+            else:
+                raise ValueError(
+                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
+                )
 
-    @property
-    def redis_cluster(self): # 连接redis集群
-        if not self._to_redis:
-            startup_nodes = [{"host": i.get("host"), "port": i.get("port")} for i in setting.REDISCLUSTER]
-            self._to_redis =  rediscluster.RedisCluster(startup_nodes=startup_nodes, decode_responses=True)
-        return self._to_redis
+        self._to_md5 = to_md5
 
     def __repr__(self):
-        return 'sha256'
-    def sha256(self,info):
-        if info is None:
-            return ''
-        res = SHA256.new(info.encode('utf-8'))
-        data = res.hexdigest()
-        return data
-
-    def _deal_datas(self, datas): # 对datas进行加密处理
-        if self._to_sha256:
+        return str(self.dedup)
+
+    def _deal_datas(self, datas):
+        if self._to_md5:
             if isinstance(datas, list):
-                keys = [self.sha256(data) for data in datas]
+                keys = [get_md5(data) for data in datas]
             else:
-                keys = self.sha256(datas)
+                keys = get_md5(datas)
         else:
             keys = copy.deepcopy(datas)
 
@@ -58,35 +99,11 @@ class Dedup:
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         """
+
         keys = self._deal_datas(datas)
-        is_added = self.insert_key(keys, skip_check)
+        is_added = self.dedup.add(keys, skip_check)
 
         return is_added
-    def insert_key(self,keys,skip_check):
-        if isinstance(keys, list):
-            for key in keys:
-                if not self.redis_cluster.exists("pylist_"+key):
-                    self.redis_cluster.set("pylist_"+key, 1,ex=86400*365*2)
-        else:
-            if not self.redis_cluster.exists("pylist_"+keys):
-                self.redis_cluster.set("pylist_"+keys,1,ex=86400*365*2)
-
-    def exists(self,keys):
-        exists = []
-        if isinstance(keys, list):
-            for key in keys:
-                exists.append(self.exit_key(key))
-        else:
-            exists.append(self.exit_key(keys))
-        return exists
-    def exit_key(self,key):
-        if self.redis_cluster.exists(key):
-            return True
-        if self.redis_cluster.exists("pylist_"+key):
-            return True
-        return False
-
-
 
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
         """
@@ -95,44 +112,58 @@ class Dedup:
         @return: list / 单个值 (存在返回1 不存在返回0)
         """
         keys = self._deal_datas(datas)
-        is_exists = self.exists(keys)
+        is_exists = self.dedup.get(keys)
 
         return is_exists
 
-
     def filter_exist_data(
         self,
         datas: List[Any],
         *,
+        datas_fingerprints: Optional[List] = None,
         callback: Callable[[Any], None] = None
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
         """
         过滤掉已存在的数据
+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
+        @param datas_fingerprints: 数据的唯一指纹 列表
         @param datas: 数据 列表
         @param callback: 数据已存在时的回调 callback(data)
         @return: None
-        [0,1,1]
-        [b,c,d]
-        []
         """
-        is_exists = self.get(datas)
+
+        is_exists = self.get(datas_fingerprints or datas)
+
         dedup_datas = []
-        while is_exists:
-            data = datas.pop(0)
-            is_exist = is_exists.pop(0)
 
-            if not is_exist:
-                dedup_datas.append(data)
-            else:
-                if callback:
-                    callback(data)
-
-        datas.extend(dedup_datas)
-        return datas
-
-if __name__ == '__main__':
-    dedup = Dedup(Dedup.BloomFilter)
-    href = 'http://www.ccgp-tianjin.gov.cn/viewer.do?id=339715380&ver=2222'
-    ss = dedup.filter_exist_data([href])
-    # res = dedup.add([href,'llk'])
-    print(ss)
+        if datas_fingerprints:
+            dedup_datas_fingerprints = []
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+                data_fingerprint = datas_fingerprints.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                    dedup_datas_fingerprints.append(data_fingerprint)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas_fingerprints.extend(dedup_datas_fingerprints)
+            datas.extend(dedup_datas)
+            return datas, datas_fingerprints
+
+        else:
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas.extend(dedup_datas)
+            return datas
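
For orientation, a minimal usage sketch of the refactored Dedup entry point, assuming the five filter-type constants behave as declared above; the redis_url and the URLs are placeholders.

```python
# A minimal sketch (not part of the diff); redis_url and the URLs are placeholders.
from feapder.dedup import Dedup

dedup = Dedup(
    Dedup.SwordFishFilter,            # cluster-wide dedup, backed by redis
    redis_url="redis://127.0.0.1:6379/0",
    expire_time=86400 * 365 * 2,      # keep fingerprints for two years
)

hrefs = [
    "http://www.example.com/notice/1.html",
    "http://www.example.com/notice/2.html",
    "http://www.example.com/notice/1.html",   # duplicate within the batch
]

# filter_exist_data mutates hrefs in place and returns the deduplicated list
new_hrefs = dedup.filter_exist_data(hrefs, callback=lambda d: print("dup:", d))
dedup.add(new_hrefs)   # returns 1/0 per key: whether the key was newly added
```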

+ 14 - 20
FworkSpider/feapder/dedup/bitarray.py

@@ -14,7 +14,7 @@ import threading
 import time
 from struct import unpack, pack
 
-from feapder.db.redisdb import RedisDB
+from feapder.dedup.basefilter import BaseFilter
 from feapder.utils.redis_lock import RedisLock
 from . import bitarray
 
@@ -146,24 +146,18 @@ class BloomFilter(object):
         比较耗时 半小时检查一次
         @return:
         """
-        # if self._is_at_capacity:
-        #     return self._is_at_capacity
-        #
-        # if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
-        #     bit_count = self.bitarray.count()
-        #     if bit_count and bit_count / self.num_bits > 0.5:
-        #         self._is_at_capacity = True
-        #
-        #     self._check_capacity_time = time.time()
-        #
-        # return self._is_at_capacity
-
         if self._is_at_capacity:
             return self._is_at_capacity
 
-        bit_count = self.bitarray.count()
-        if bit_count and bit_count / self.num_bits > 0.5:
-            self._is_at_capacity = True
+        if (
+            not self._check_capacity_time
+            or time.time() - self._check_capacity_time > 1800
+        ):
+            bit_count = self.bitarray.count()
+            if bit_count and bit_count / self.num_bits > 0.5:
+                self._is_at_capacity = True
+
+            self._check_capacity_time = time.time()
 
         return self._is_at_capacity
 
@@ -174,8 +168,8 @@ class BloomFilter(object):
         @param keys: list or one key
         @return:
         """
-        if self.is_at_capacity:
-            raise IndexError("BloomFilter is at capacity")
+        # if self.is_at_capacity:
+        #     raise IndexError("BloomFilter is at capacity")
 
         is_list = isinstance(keys, list)
 
@@ -197,7 +191,7 @@ class BloomFilter(object):
         return is_added if is_list else is_added[0]
 
 
-class ScalableBloomFilter(object):
+class ScalableBloomFilter(BaseFilter):
     """
     自动扩展空间的bloomfilter, 当一个filter满一半的时候,创建下一个
     """
@@ -273,7 +267,7 @@ class ScalableBloomFilter(object):
                     if self.name
                     else "ScalableBloomFilter"
                 )
-                with RedisLock(key=key) as lock:
+                with RedisLock(key=key, redis_url=self.redis_url) as lock:
                     if lock.locked:
                         while True:
                             if self.filters[-1].is_at_capacity:
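
The capacity check above is now throttled so the comparatively expensive bitarray.count() runs at most once every 30 minutes; a standalone sketch of the same pattern (the names here are illustrative, not the framework's API):

```python
import time

class ThrottledCapacityCheck:
    """Re-evaluate an expensive predicate at most once per `interval` seconds."""

    def __init__(self, count_bits, num_bits, interval=1800):
        self._count_bits = count_bits   # callable returning the number of set bits
        self._num_bits = num_bits
        self._interval = interval
        self._is_at_capacity = False
        self._last_check = 0.0

    @property
    def is_at_capacity(self):
        if self._is_at_capacity:
            return True
        if time.time() - self._last_check > self._interval:
            bit_count = self._count_bits()
            # same heuristic as above: more than half the bits set means "full"
            if bit_count and bit_count / self._num_bits > 0.5:
                self._is_at_capacity = True
            self._last_check = time.time()
        return self._is_at_capacity
```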

+ 13 - 2
FworkSpider/feapder/dedup/expirefilter.py

@@ -11,9 +11,10 @@ Created on 2018/12/13 9:44 PM
 import time
 
 from feapder.db.redisdb import RedisDB
+from feapder.dedup.basefilter import BaseFilter
 
 
-class ExpireFilter:
+class ExpireFilter(BaseFilter):
     redis_db = None
 
     def __init__(
@@ -55,7 +56,17 @@ class ExpireFilter:
         return is_added
 
     def get(self, keys):
-        return self.redis_db.zexists(self.name, keys)
+        is_exist = self.redis_db.zexists(self.name, keys)
+        if isinstance(keys, list):
+            # 判断数据本身是否重复
+            temp_set = set()
+            for i, key in enumerate(keys):
+                if key in temp_set:
+                    is_exist[i] = 1
+                else:
+                    temp_set.add(key)
+
+        return is_exist
 
     def del_expire_key(self):
         self.redis_db.zremrangebyscore(
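
The extra loop added to get() also flags keys repeated inside the same batch, so two identical fingerprints submitted together are not both reported as new; the post-processing step in isolation (the zexists result is faked here):

```python
def mark_in_batch_duplicates(keys, is_exist):
    """Mark keys repeated within the same batch as already existing (1)."""
    seen = set()
    for i, key in enumerate(keys):
        if key in seen:
            is_exist[i] = 1
        else:
            seen.add(key)
    return is_exist

# redis reports none of them as seen yet, but "a" occurs twice in the batch
print(mark_in_batch_duplicates(["a", "b", "a"], [0, 0, 0]))  # -> [0, 0, 1]
```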

+ 0 - 178
FworkSpider/feapder/dedup/old__init__.py

@@ -1,178 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-12-13 21:08
----------
-@summary: 布隆去重,测试框架使用的去重方式
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import copy
-from typing import Any, List, Union, Optional, Tuple, Callable
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-
-
-class Dedup:
-    BloomFilter = 1
-    MemoryFilter = 2
-    ExpireFilter = 3
-
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        """
-        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
-        Args:
-            filter_type: 过滤器类型 BloomFilter
-            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
-            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
-            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
-            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
-            to_md5: 去重前是否将数据转为MD5,默认是
-            redis_url: redis://[[username]:[password]]@localhost:6379/0
-                       BloomFilter 与 ExpireFilter 使用
-                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
-            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
-            error_rate:布隆过滤器的误判率 默认0.00001
-            **kwargs:
-        """
-
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
-                "name", "bloomfilter"
-            )
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
-                )
-
-        self._to_md5 = to_md5
-
-    def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
-            if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
-            else:
-                keys = get_md5(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def add(
-        self, datas: Union[List[Any], Any], skip_check: bool = False
-    ) -> Union[List[Any], Any]:
-        """
-        添加数据
-        @param datas: list / 单个值
-        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
-        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-
-        keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
-
-        return is_added
-
-    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
-        """
-        检查数据是否存在
-        @param datas: list / 单个值
-        @return: list / 单个值 (存在返回1 不存在返回0)
-        """
-        keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
-
-        return is_exists
-
-    def filter_exist_data(
-        self,
-        datas: List[Any],
-        *,
-        datas_fingerprints: Optional[List] = None,
-        callback: Callable[[Any], None] = None
-    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
-        """
-        过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
-        @param datas: 数据 列表
-        @param callback: 数据已存在时的回调 callback(data)
-        @return: None
-        """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
-        dedup_datas = []
-
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas

+ 40 - 73
FworkSpider/feapder/network/__init__.py

@@ -16,15 +16,18 @@ import warnings
 from collections import Iterable
 from enum import Enum, unique
 
+import requests
+from func_timeout import func_set_timeout
+
 import feapder.utils.tools as tools
 from feapder import setting
-from feapder.db.mysqldb import MysqlDB
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
+from feapder.network import user_agent
 from feapder.utils import metrics
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
 from feapder.utils.tools import send_msg
-from feapder.utils.webdriver import WebDriver
 
 
 class CookiePoolInterface(metaclass=abc.ABCMeta):
@@ -101,21 +104,14 @@ class PageCookiePool(CookiePoolInterface):
         可能会重写
         @return:
         """
-        with WebDriver(**self._kwargs) as driver:
-            driver.get(self._page_url)
-            cookies = driver.get_cookies()
-            cookies_json = {}
-            for cookie in cookies:
-                cookies_json[cookie["name"]] = cookie["value"]
-
-            for key in self._must_contained_keys:
-                if key not in cookies_json:
-                    break
-            else:
-                return cookies_json
-
-            log.error("获取cookie失败 cookies = {}".format(cookies_json))
-            return None
+        url = self._page_url
+        header = {
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": user_agent.get()
+        }
+        res = requests.get(url, headers=header)
+        cookies = requests.utils.dict_from_cookiejar(res.cookies)
+        return cookies
 
     def add_cookies(self, cookies):
         log.info("添加cookie {}".format(cookies))
@@ -126,7 +122,6 @@ class PageCookiePool(CookiePoolInterface):
             try:
                 now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                 need_cookie_count = self._min_cookies - now_cookie_count
-
                 if need_cookie_count > 0:
                     log.info(
                         "当前cookie数为 {} 小于 {}, 生产cookie".format(
@@ -141,7 +136,6 @@ class PageCookiePool(CookiePoolInterface):
                         log.exception(e)
                 else:
                     log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-
                     # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
                     last_count_info = self._redisdb.strget(
                         self._tab_cookie_pool_last_count
@@ -176,6 +170,7 @@ class PageCookiePool(CookiePoolInterface):
                 log.exception(e)
                 tools.delay_time(1)
 
+    @func_set_timeout(120)
     def get_cookie(self, wait_when_null=True):
         while True:
             try:
@@ -184,9 +179,10 @@ class PageCookiePool(CookiePoolInterface):
                     log.info("暂无cookie 生产中...")
                     self._keep_alive = False
                     self._min_cookies = 1
-                    with RedisLock(
-                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
-                    ) as _lock:
+                    _lock = RedisLock(key=self._tab_cookie_pool,
+                                      lock_timeout=3600,
+                                      wait_timeout=5)
+                    with _lock:
                         if _lock.locked:
                             self.run()
                     continue
@@ -240,25 +236,10 @@ class LoginCookiePool(CookiePoolInterface):
         self._password_key = password_key
 
         self._redisdb = RedisDB()
-        self._mysqldb = ()
-
-        self.create_userbase()
-
-    def create_userbase(self):
-        sql = f"""
-            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
-              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
-              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
-              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
-              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态(0未登录 1已登录)',
-              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封(0 未封 1 被封)',
-              PRIMARY KEY (`id`),
-              UNIQUE KEY `username` (`username`) USING BTREE
-            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-        """
-        self._mysqldb.execute(sql)
+        self._mongo = MongoDB(db='user_login')
 
     def create_cookie(self, username, password):
+
         """
         创建cookie
         @param username: 用户名
@@ -273,15 +254,7 @@ class LoginCookiePool(CookiePoolInterface):
         @return: yield username, password
         """
 
-        sql = "select {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
-            username_key=self._username_key,
-            password_key=self._password_key,
-            table_userbase=self._table_userbase,
-            lock_state_key=self._lock_state_key,
-            login_state_key=self._login_state_key,
-        )
-
-        return self._mysqldb.find(sql)
+        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
 
     def handle_login_failed_user(self, username, password):
         """
@@ -305,16 +278,13 @@ class LoginCookiePool(CookiePoolInterface):
         user_cookie = {"username": username, "cookie": cookie}
 
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
+        self._mongo.add(
+                coll_name=self._table_userbase,
+                data={self._login_state_key:1},
+                update_columns=self._username_key,
+                update_columns_value=username)
 
-        sql = "update {table_userbase} set {login_state_key} = 1 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            login_state_key=self._login_state_key,
-            username_key=self._username_key,
-            username=username,
-        )
-
-        self._mysqldb.update(sql)
-
+    @func_set_timeout(60)
     def get_cookie(self, wait_when_null=True) -> User:
         while True:
             try:
@@ -342,24 +312,19 @@ class LoginCookiePool(CookiePoolInterface):
         user_info = {"username": user.username, "cookie": user.cookie}
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
 
-        sql = "update {table_userbase} set {login_state_key} = 0 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            login_state_key=self._login_state_key,
-            username_key=self._username_key,
-            username=user.username,
-        )
-
-        self._mysqldb.update(sql)
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._login_state_key: 0},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
 
     def user_is_locked(self, user: User):
-        sql = "update {table_userbase} set {lock_state_key} = 1 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            lock_state_key=self._lock_state_key,
-            username_key=self._username_key,
-            username=user.username,
-        )
 
-        self._mysqldb.update(sql)
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._lock_state_key: 1},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
 
     def run(self):
         with RedisLock(
@@ -373,7 +338,9 @@ class LoginCookiePool(CookiePoolInterface):
                 if not user_infos:
                     log.info("无可用用户")
 
-                for username, password in user_infos:
+                for info in user_infos:
+                    username = info.get("username")
+                    password = info.get("password")
                     for i in range(self._login_retry_times):
                         try:
                             cookie = self.create_cookie(username, password)
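
With the WebDriver dependency dropped, create_cookie in PageCookiePool now boils down to a single requests call; a hedged sketch of that flow on its own (the URL and User-Agent are placeholders, and real pages may need more than a plain GET):

```python
import requests

def fetch_page_cookies(page_url: str, user_agent: str) -> dict:
    """Collect first-visit cookies the way the reworked create_cookie does."""
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": user_agent,
    }
    res = requests.get(page_url, headers=headers, timeout=30)
    return requests.utils.dict_from_cookiejar(res.cookies)

# cookies = fetch_page_cookies("http://www.example.com/list", "Mozilla/5.0 (placeholder)")
```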

+ 0 - 20
FworkSpider/feapder/network/item.py

@@ -1,20 +0,0 @@
-117.88.5.96:8860
-111.179.93.27:8861
-111.179.93.27:8860
-113.226.100.155:8861
-113.226.100.155:8860
-114.99.103.81:8861
-171.13.51.41:8861
-114.99.103.81:8860
-171.13.51.41:8860
-125.41.17.67:8861
-125.41.17.67:8860
-113.123.0.127:8861
-117.88.5.96:8861
-182.101.196.230:8861
-113.123.0.127:8860
-182.101.196.230:8860
-182.34.102.234:8861
-182.34.102.234:8860
-117.88.4.100:8861
-117.88.4.100:8860

+ 0 - 20
FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt

@@ -1,20 +0,0 @@
-122.159.219.174:8860&&1653299700
-182.34.19.216:8860&&1653299010
-106.35.223.168:8861&&1653298655
-125.45.91.69:8861&&1653298844
-125.45.91.69:8860&&1653298844
-122.159.219.174:8861&&1653299700
-106.35.223.168:8860&&1653298655
-182.34.19.216:8861&&1653299010
-113.121.20.254:8861&&1653300488
-125.72.106.216:8861&&1653300251
-113.121.20.254:8860&&1653300488
-125.72.106.216:8860&&1653300251
-119.112.80.248:8861&&1653298967
-119.112.80.248:8860&&1653298967
-58.213.26.197:8860&&1653298952
-58.213.26.197:8861&&1653298952
-113.226.110.38:8861&&1653300048
-113.226.110.38:8860&&1653300048
-113.121.41.156:8860&&1653299102
-113.121.41.156:8861&&1653299102

+ 5 - 9
FworkSpider/feapder/network/proxy_pool.py

@@ -1,6 +1,6 @@
-# coding:utf8
+# -*- coding: utf-8 -*-
 """
-代理池  弃用
+代理池
 """
 import datetime
 import json
@@ -122,9 +122,9 @@ def get_proxy_from_http(proxy_source_url, **kwargs):
         response = requests.get(proxy_source_url, timeout=20)
         # 改写:获取scocks代理的response处理
         for proxy in response.json():
-            host = decrypt(proxy['host'])
-            port = proxy['port']
-            endTime = proxy['EndTime']
+            host = decrypt(proxy['ip'])
+            port = proxy['ports'][0]
+            endTime = proxy['lifetime']
             pool.append(f"{host}:{port}&&{endTime}")
 
         with open(os.path.join(proxy_path, filename), "w") as f:
@@ -757,7 +757,3 @@ class ProxyPool(ProxyPoolBase):
         :return:
         """
         return get_proxy_from_url(**self.kwargs)
-# 
-# 
-# if __name__ == '__main__':
-#     ProxyPool().get()
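
The proxy source's JSON now carries ip / ports / lifetime fields instead of host / port / EndTime; the parsing performed above, shown with a faked payload (decrypt is stubbed because its implementation is outside this diff):

```python
def parse_proxy_items(items, decrypt=lambda s: s):
    """Build "host:port&&endTime" pool entries from the ip/ports/lifetime payload."""
    pool = []
    for proxy in items:
        host = decrypt(proxy["ip"])
        port = proxy["ports"][0]
        end_time = proxy["lifetime"]
        pool.append(f"{host}:{port}&&{end_time}")
    return pool

sample = [{"ip": "113.121.20.254", "ports": [8860], "lifetime": 1653300488}]
print(parse_proxy_items(sample))   # ['113.121.20.254:8860&&1653300488']
```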

+ 23 - 37
FworkSpider/feapder/network/request.py

@@ -7,9 +7,10 @@ Created on 2018-07-25 11:49:08
 @author: Boris
 @email:  boris_liu@foxmail.com
 """
+import copy
+import re
 
 import requests
-from func_timeout import func_set_timeout, FunctionTimedOut
 from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
@@ -18,11 +19,10 @@ import feapder.setting as setting
 import feapder.utils.tools as tools
 from feapder.db.redisdb import RedisDB
 from feapder.network import user_agent
-from feapder.network.proxy_pool import ProxyPool
 from feapder.network.response import Response
-from feapder.utils.log import Log
+from feapder.utils.log import log
 from feapder.utils.webdriver import WebDriverPool
-log = Log()
+
 # 屏蔽warning信息
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
@@ -40,7 +40,7 @@ class Request(object):
     oss_handler = None
 
     __REQUEST_ATTRS__ = {
-        # 'method', 'url', 必须传递 不加入**kwargs中
+        # "method", "url", 必须传递 不加入**kwargs中
         "params",
         "data",
         "headers",
@@ -92,6 +92,7 @@ class Request(object):
         render_time=0,
         splash=False,
         iframes=0,
+        rel_count=0,
         **kwargs,
     ):
         """
@@ -149,6 +150,7 @@ class Request(object):
         self.render = render
         self.splash = splash
         self.iframes = iframes
+        self.rel_count = rel_count
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
 
         self.requests_kwargs = {}
@@ -200,7 +202,6 @@ class Request(object):
 
         return self.__class__.webdriver_pool
 
-
     @property
     def to_dict(self):
         request_dict = {}
@@ -245,7 +246,6 @@ class Request(object):
             else self.callback
         )
 
-    @func_set_timeout(30)
     def get_response(self, save_cached=False):
         """
         获取带有selector功能的response
@@ -258,7 +258,9 @@ class Request(object):
         )  # connect=22 read=22
 
         # 设置stream
-        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
+        # 默认情况下,当你进行网络请求后,响应体会立即被下载。
+        # stream=True时,调用Response.content 才会下载响应体,默认只返回header。
+        # 缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
         self.requests_kwargs.setdefault("stream", True)
 
         # 关闭证书验证
@@ -267,7 +269,7 @@ class Request(object):
         # 设置请求方法
         method = self.__dict__.get("method")
         if not method:
-            if "data" in self.requests_kwargs:
+            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                 method = "POST"
             else:
                 method = "GET"
@@ -329,7 +331,6 @@ class Request(object):
             )
         )
 
-
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
         )  # self.use_session 优先级高
@@ -338,7 +339,7 @@ class Request(object):
             # 使用request的user_agent、cookies、proxy
             user_agent = headers.get("User-Agent") or headers.get("user-agent")
             cookies = self.requests_kwargs.get("cookies")
-            print(cookies)
+            print(f'cookies >>>  {cookies}')
             if cookies and isinstance(cookies, RequestsCookieJar):
                 cookies = cookies.get_dict()
 
@@ -347,9 +348,7 @@ class Request(object):
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
 
-
             browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
-
             try:
                 if proxies:
                     self.chage_ip(browser)
@@ -375,24 +374,21 @@ class Request(object):
                         },
                     }
                 )
-
                 response.browser = browser
             except Exception as e:
                 self._webdriver_pool.remove(browser)
                 raise e
-
         elif use_session:
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
         elif self.splash:
-            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
+            resp = requests.get(setting.SWORDFISH_RENDER_URL, params={
                 'iframes': self.iframes,
                 'wait': self.render_time,
                 'html': 1,
-                'proxy': self.get_proxy().get("http"),
+                'proxy': {} if self.proxies == False else self.get_proxy().get("http"),
                 'url': self.url
             })
-
             response = Response(resp)
 
             # if self.iframes:
@@ -433,7 +429,6 @@ class Request(object):
 
         if save_cached:
             self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-        log.info("requests",extra={"url":response.url,"code":response.status_code})
         return response
 
     def proxies(self):
@@ -450,19 +445,17 @@ class Request(object):
         """
         proxies = self.proxies()
         if proxies:
-            return proxies.get("http", "").strip("http://") or proxies.get(
-                "https", ""
-            ).strip("https://")
+            return re.sub(
+                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
+            )
 
     def get_proxy(self):
-        headers = {
-            "Authorization": setting.JIANYU_PROXY_AUTHOR
-        }
-        proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
+        headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
+        proxy = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers).json()
         print(f"切换代理:{proxy.get('data')}")
         return proxy.get("data")
 
-    def chage_ip(self,browser):
+    def chage_ip(self, browser):
         ip = self.get_proxy().get("http")  # ip格式"127.0.0.1:80"
         ip = ip.split("//")[-1]
         browser.get("about:config")
@@ -472,8 +465,7 @@ class Request(object):
         prefs.setIntPref("network.proxy.type", 1);
         prefs.setCharPref("network.proxy.socks", "%s");
         prefs.setIntPref("network.proxy.socks_port", "%s");
-        ''' % (
-        ip.split(':')[0], ip.split(':')[1])
+        ''' % (ip.split(':')[0], ip.split(':')[1])
         # 执行js
         browser.execute_script(setupScript)
 
@@ -542,13 +534,7 @@ class Request(object):
         response_dict = self._cache_db.strget(self._cached_redis_key)
         if not response_dict:
             log.info("无response缓存  重新下载")
-            try:
-                response_obj = self.get_response(save_cached=save_cached)
-            except FunctionTimedOut:
-                response_obj = None
-                log.info("请求超时")
-                log.info("requests", extra={"url": self.url, "code": 0})
-
+            response_obj = self.get_response(save_cached=save_cached)
         else:
             response_dict = eval(response_dict)
             response_obj = Response.from_dict(response_dict)
@@ -566,4 +552,4 @@ class Request(object):
         return cls(**request_dict)
 
     def copy(self):
-        return self.__class__.from_dict(self.to_dict)
+        return self.__class__.from_dict(copy.deepcopy(self.to_dict))
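
Two behavioural changes in Request are easy to miss: the default HTTP method now also becomes POST when a json body is supplied, and copy() deep-copies to_dict so mutating a copy no longer leaks into the original request. A minimal sketch of the method-inference rule (the kwarg names mirror the requests library):

```python
def infer_method(requests_kwargs: dict, explicit_method: str = None) -> str:
    """POST when a body (data or json) is present, otherwise GET."""
    if explicit_method:
        return explicit_method
    if "data" in requests_kwargs or "json" in requests_kwargs:
        return "POST"
    return "GET"

assert infer_method({"json": {"page": 1}}) == "POST"
assert infer_method({"params": {"page": 1}}) == "GET"
```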

+ 0 - 513
FworkSpider/feapder/network/request6.29.py

@@ -1,513 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:49:08
----------
-@summary: 请求结构体
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import requests
-from func_timeout import func_set_timeout, FunctionTimedOut
-from requests.adapters import HTTPAdapter
-from requests.cookies import RequestsCookieJar
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
-from feapder.network.proxy_pool import ProxyPool
-from feapder.network.response import Response
-from feapder.utils.log import Log
-from feapder.utils.webdriver import WebDriverPool
-log = Log()
-# 屏蔽warning信息
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
-
-class Request(object):
-    session = None
-    webdriver_pool: WebDriverPool = None
-    user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
-
-    cache_db = None  # redis / pika
-    cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
-    cached_expire_time = 1200  # 缓存过期时间
-
-    local_filepath = None
-    oss_handler = None
-
-    __REQUEST_ATTRS__ = {
-        # 'method', 'url', 必须传递 不加入**kwargs中
-        "params",
-        "data",
-        "headers",
-        "cookies",
-        "files",
-        "auth",
-        "timeout",
-        "allow_redirects",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "json",
-    }
-
-    DEFAULT_KEY_VALUE = dict(
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-    )
-
-    def __init__(
-        self,
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        **kwargs,
-    ):
-        """
-        @summary: Request参数
-        ---------
-        框架参数
-        @param url: 待抓取url
-        @param retry_times: 当前重试次数
-        @param priority: 优先级 越小越优先 默认300
-        @param parser_name: 回调函数所在的类名 默认为当前类
-        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
-        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
-        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
-        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
-        @param use_session: 是否使用session方式
-        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
-        @param download_midware: 下载中间件。默认为parser中的download_midware
-        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
-        @param render: 是否用浏览器渲染
-        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
-        --
-        以下参数与requests参数使用方式一致
-        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
-        @param params: 请求参数
-        @param data: 请求body
-        @param json: 请求json字符串,同 json.dumps(data)
-        @param headers:
-        @param cookies: 字典 或 CookieJar 对象
-        @param files:
-        @param auth:
-        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
-        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
-        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
-        @param verify: 为 True 时将会验证 SSL 证书
-        @param stream: 如果为 False,将会立即下载响应内容
-        @param cert:
-        --
-        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
-        ---------
-        @result:
-        """
-
-        self.url = url
-        self.retry_times = retry_times
-        self.priority = priority
-        self.parser_name = parser_name
-        self.callback = callback
-        self.filter_repeat = filter_repeat
-        self.auto_request = auto_request
-        self.request_sync = request_sync
-        self.use_session = use_session
-        self.random_user_agent = random_user_agent
-        self.download_midware = download_midware
-        self.is_abandoned = is_abandoned
-        self.render = render
-        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
-
-        self.requests_kwargs = {}
-        for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
-                self.requests_kwargs[key] = value
-
-            self.__dict__[key] = value
-
-    def __repr__(self):
-        try:
-            return "<Request {}>".format(self.url)
-        except:
-            return "<Request {}>".format(str(self.to_dict)[:40])
-
-    def __setattr__(self, key, value):
-        """
-        针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
-        @param key:
-        @param value:
-        @return:
-        """
-        self.__dict__[key] = value
-
-        if key in self.__class__.__REQUEST_ATTRS__:
-            self.requests_kwargs[key] = value
-
-    def __lt__(self, other):
-        return self.priority < other.priority
-
-    @property
-    def _session(self):
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-        if use_session and not self.__class__.session:
-            self.__class__.session = requests.Session()
-            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
-            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
-            self.__class__.session.mount("http", http_adapter)
-
-        return self.__class__.session
-
-    @property
-    def _webdriver_pool(self):
-        if not self.__class__.webdriver_pool:
-            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
-
-        return self.__class__.webdriver_pool
-
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool()
-
-        return self.__class__.proxies_pool
-
-    @property
-    def to_dict(self):
-        request_dict = {}
-
-        self.callback = (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-        self.download_midware = (
-            getattr(self.download_midware, "__name__")
-            if callable(self.download_midware)
-            else self.download_midware
-        )
-
-        for key, value in self.__dict__.items():
-            if (
-                key in self.__class__.DEFAULT_KEY_VALUE
-                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
-                or key == "requests_kwargs"
-            ):
-                continue
-
-            if key in self.__class__.__REQUEST_ATTRS__:
-                if not isinstance(
-                    value, (bytes, bool, float, int, str, tuple, list, dict)
-                ):
-                    value = tools.dumps_obj(value)
-            else:
-                if not isinstance(value, (bytes, bool, float, int, str)):
-                    value = tools.dumps_obj(value)
-
-            request_dict[key] = value
-
-        return request_dict
-
-    @property
-    def callback_name(self):
-        return (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-    @func_set_timeout(30)
-    def get_response(self, save_cached=False):
-        """
-        获取带有selector功能的response
-        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
-        @return:
-        """
-        # 设置超时默认时间
-        self.requests_kwargs.setdefault(
-            "timeout", setting.REQUEST_TIMEOUT
-        )  # connect=22 read=22
-
-        # 设置stream
-        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
-        self.requests_kwargs.setdefault("stream", True)
-
-        # 关闭证书验证
-        self.requests_kwargs.setdefault("verify", False)
-
-        # 设置请求方法
-        method = self.__dict__.get("method")
-        if not method:
-            if "data" in self.requests_kwargs:
-                method = "POST"
-            else:
-                method = "GET"
-
-        # 随机user—agent
-        headers = self.requests_kwargs.get("headers", {})
-        if "user-agent" not in headers and "User-Agent" not in headers:
-            if self.render:  # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
-                ua = setting.WEBDRIVER.get(
-                    "user_agent"
-                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-            else:
-                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-
-            if self.random_user_agent and setting.RANDOM_HEADERS:
-                headers.update({"User-Agent": ua})
-                self.requests_kwargs.update(headers=headers)
-        else:
-            self.requests_kwargs.setdefault(
-                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
-            )
-
-        # 代理
-        proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
-
-        log.debug(
-            """
-                -------------- %srequest for ----------------
-                url  = %s
-                method = %s
-                body = %s
-                """
-            % (
-                ""
-                if not self.parser_name
-                else "%s.%s "
-                % (
-                    self.parser_name,
-                    (
-                        self.callback
-                        and callable(self.callback)
-                        and getattr(self.callback, "__name__")
-                        or self.callback
-                    )
-                    or "parse",
-                ),
-                self.url,
-                method,
-                self.requests_kwargs,
-            )
-        )
-
-        # def hooks(response, *args, **kwargs):
-        #     print(response.url)
-        #
-        # self.requests_kwargs.update(hooks={'response': hooks})
-
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-
-        if self.render:
-            # 使用request的user_agent、cookies、proxy
-            user_agent = headers.get("User-Agent") or headers.get("user-agent")
-            cookies = self.requests_kwargs.get("cookies")
-            print(cookies)
-            if cookies and isinstance(cookies, RequestsCookieJar):
-                cookies = cookies.get_dict()
-
-            if not cookies:
-                cookie_str = headers.get("Cookie") or headers.get("cookie")
-                if cookie_str:
-                    cookies = tools.get_cookies_from_str(cookie_str)
-
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
-
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
-
-            try:
-                browser.get(self.url)
-                if cookies:
-                    browser.cookies = cookies
-                if self.render_time:
-                    tools.delay_time(self.render_time)
-
-                html = browser.page_source
-                response = Response.from_dict(
-                    {
-                        "url": browser.current_url,
-                        "cookies": browser.cookies,
-                        "_content": html.encode(),
-                        "status_code": 200,
-                        "elapsed": 666,
-                        "headers": {
-                            "User-Agent": browser.execute_script(
-                                "return navigator.userAgent"
-                            ),
-                            "Cookie": tools.cookies2str(browser.cookies),
-                        },
-                    }
-                )
-
-                response.browser = browser
-            except Exception as e:
-                self._webdriver_pool.remove(browser)
-                raise e
-
-        elif use_session:
-            response = self._session.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-        else:
-            response = requests.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-
-        if save_cached:
-            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-        log.info("requests",extra={"url":response.url,"code":response.status_code})
-        return response
-
-    def proxies(self):
-        """
-
-        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-
-        """
-        return self.requests_kwargs.get("proxies")
-
-    def proxy(self):
-        """
-
-        Returns: ip:port
-
-        """
-        proxies = self.proxies()
-        if proxies:
-            return proxies.get("http", "").strip("http://") or proxies.get(
-                "https", ""
-            ).strip("https://")
-
-    def user_agent(self):
-        headers = self.requests_kwargs.get("headers")
-        if headers:
-            return headers.get("user_agent") or headers.get("User-Agent")
-
-    @property
-    def fingerprint(self):
-        """
-        request唯一表识
-        @return:
-        """
-        url = self.__dict__.get("url", "")
-        # url 归一化
-        url = tools.canonicalize_url(url)
-        args = [url]
-
-        for arg in ["params", "data", "files", "auth", "cert", "json"]:
-            if self.requests_kwargs.get(arg):
-                args.append(self.requests_kwargs.get(arg))
-
-        return tools.get_md5(*args)
-
-    @property
-    def _cache_db(self):
-        if not self.__class__.cache_db:
-            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
-
-        return self.__class__.cache_db
-
-    @property
-    def _cached_redis_key(self):
-        if self.__class__.cached_redis_key:
-            return (
-                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
-            )
-        else:
-            return f"response_cached:test:{self.fingerprint}"
-
-    def save_cached(self, response, expire_time=1200):
-        """
-        使用redis保存response 用于调试 不用每回都下载
-        @param response:
-        @param expire_time: 过期时间
-        @return:
-        """
-
-        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
-
-    def get_response_from_cached(self, save_cached=True):
-        """
-        从缓存中获取response
-        注意:
-            属性值为空:
-                -raw : urllib3.response.HTTPResponse
-                -connection:requests.adapters.HTTPAdapter
-                -history
-
-            属性含义改变:
-                - request 由requests 改为Request
-        @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
-        @return:
-        """
-        response_dict = self._cache_db.strget(self._cached_redis_key)
-        if not response_dict:
-            log.info("无response缓存  重新下载")
-            try:
-                response_obj = self.get_response(save_cached=save_cached)
-            except FunctionTimedOut:
-                log.info("请求超时")
-                log.info("requests", extra={"url": self.url, "code": 0})
-
-        else:
-            response_dict = eval(response_dict)
-            response_obj = Response.from_dict(response_dict)
-        return response_obj
-
-    def del_response_cached(self):
-        self._cache_db.clear(self._cached_redis_key)
-
-    @classmethod
-    def from_dict(cls, request_dict):
-        for key, value in request_dict.items():
-            if isinstance(value, bytes):  # 反序列化 如item
-                request_dict[key] = tools.loads_obj(value)
-
-        return cls(**request_dict)
-
-    def copy(self):
-        return self.__class__.from_dict(self.to_dict)

+ 2 - 1
FworkSpider/feapder/network/response.py

@@ -14,4 +14,5 @@ redis-py-cluster>=2.1.0
 cryptography>=3.3.2
 urllib3>=1.25.8
 loguru>=0.5.3
-influxdb>=5.3.1
+influxdb>=5.3.1
+func-timeout==4.3.5
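
The new func-timeout pin backs the @func_set_timeout decorators added to the cookie pools; a hedged usage sketch (the 2-second limit and the sleeping body are illustrative only):

```python
import time

from func_timeout import FunctionTimedOut, func_set_timeout

@func_set_timeout(2)   # raises FunctionTimedOut if the call exceeds 2 seconds
def wait_for_cookie():
    time.sleep(5)      # stand-in for a cookie pool that never fills up
    return {"session": "placeholder"}

try:
    wait_for_cookie()
except FunctionTimedOut:
    print("cookie pool timed out; the caller can retry or give up")
```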

+ 6 - 4
FworkSpider/feapder/setting.py

@@ -4,9 +4,9 @@ import os
 
 # redis 表名
 # 任务表模版
-TAB_REQUSETS = "{redis_key}:z_requsets"
+TAB_REQUESTS = "{redis_key}:z_requests"
 # 任务失败模板
-TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets"
+TAB_FAILED_REQUESTS = "{redis_key}:z_failed_requests"
 # 数据保存失败模板
 TAB_FAILED_ITEMS = "{redis_key}:s_failed_items"
 # 爬虫状态表模版
@@ -78,6 +78,8 @@ WEBDRIVER = dict(
 
 # 爬虫启动时,重新抓取失败的requests
 RETRY_FAILED_REQUESTS = False
+# 爬虫启动时,重新入库失败的item
+RETRY_FAILED_ITEMS = False
 # 保存失败的request
 SAVE_FAILED_REQUEST = True
 # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
@@ -111,11 +113,11 @@ USE_SESSION = False
 # 去重
 ITEM_FILTER_ENABLE = False  # item 去重
 ITEM_FILTER_SETTING = dict(
-    filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+    filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
 )
 REQUEST_FILTER_ENABLE = False  # request 去重
 REQUEST_FILTER_SETTING = dict(
-    filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+    filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
     expire_time=2592000,  # 过期时间1个月
 )
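
For reference, a hedged example of how a project-level setting.py could opt into the two new filter types (all values are illustrative, not framework defaults):

```python
# Illustrative project-level overrides, not framework defaults.
ITEM_FILTER_ENABLE = True
ITEM_FILTER_SETTING = dict(
    filter_type=5,                 # SwordFishFilter: cluster-wide dedup
    expire_time=86400 * 365 * 2,   # keep item fingerprints for two years
)

REQUEST_FILTER_ENABLE = True
REQUEST_FILTER_SETTING = dict(
    filter_type=4,                 # LiteFilter: in-process, no redis required
)
```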
 

+ 98 - 30
FworkSpider/feapder/templates/air_spider_template.tmpl

@@ -6,20 +6,22 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
+import re
 import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import time
 from urllib.parse import urljoin
-
 import feapder
 from feapder.utils.tools import wechat_warning
+from untils.attachment import AttachmentDownloader
 import execjs
 from items.spider_item import DataBakItem, MgpListItem
 from feapder.db.mongodb import MongoDB
+from feapder.utils.log import log
 
 
 
-class ${spider_name}(feapder.Spider):
+class Details(feapder.Spider):
     _to_db = None
     db_name = 'mgp_list'
     send_list = []
@@ -32,51 +34,122 @@ class ${spider_name}(feapder.Spider):
 
     def start_requests(self):
         while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},sort={"failed":-1},limit=50)
+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},limit=50)
             for item in data_lsit:
+                log.debug(item.get("item"))
                 request_params = item.get("request_params")
-
-                '''可自定义'''
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,proxies=item.get("proxies"))
-                self.to_db.delete(self.db_name,item)
+                is_join_html = item.get("is_join_html")          # 正文是否根据xpath拼接
+                extra_html = item.get("extra_html")              # 过滤无效内容
+                if item.get("proxies"):
+                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
+                                          callback=eval(item.get("parse")),base_info=item,**request_params)
+                else:
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
+                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
+                self.to_db.delete(self.db_name, {"_id": item.get("_id")})
             break
 
     def detail_get(self,request,response):
-        '''需自定义解析规则'''
+
         items = request.item
         list_item = DataBakItem()
         for key in items:
             list_item.__setitem__(key,items[key])
+
         html = ''
-        # for xpath in request.deal_detail:
-        #    html = response.xpath(xpath).extract_first()
-        #    if html is not None:
-        #        break
+        for xpath in request.deal_detail:
+            html_part = response.xpath(xpath).extract_first()  # 标书详细内容
+            if html_part is None:
+                continue
+            if request.is_join_html:
+                html += html_part
+            else:
+                html = html_part
+                break
+
+        extra_html_info = request.extra_html
+        if html and extra_html_info:
+            for extra_item in extra_html_info:
+                if re.search('^//.*', extra_item):
+                    extra_html = response.xpath(extra_item).extract_first()
+                else:
+                    extra_html = extra_item
+                if extra_html:
+                    html = html.replace(extra_html, '')
 
         list_item.contenthtml = html
-        # if request.files:
-        #     files_info = request.files
-        #     files =  response.xpath(files_info.get("xpath")).extract()
-        #     for file_url in files:
-        #         if files_info.get("host"):
-        #             file_url = urljoin(files_info.get("host"), file_url)
-        #         if file_url.split(".")[-1] in files.get("other_files"):
-        #             continue
+
+        if request.files_info:      # 附件下载
+            files_info = request.files_info
+            files = response.xpath(files_info.get("list_xpath"))
+            if len(files)>0:
+                attachments = {}
+                for info in files:
+                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
+                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
+                    if not file_name:
+                        file_name = info.xpath(files_info.get("name_xpath")).extract()
+                    if file_name:
+                        file_name = "".join("".join(file_name).split()).strip()
+                        if files_info.get("host"):
+                            file_url = urljoin(files_info.get("host"), file_url)
+                        file_type = files_info.get("file_type")
+                        if not file_type:
+                            file_type = file_url.split("?")[0].split(".")[-1].lower()
+                            if file_type not in files_info.get("files_type"):
+                                file_type = file_name.split("?")[0].split(".")[-1].lower()
+
+                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
+                            attachment = AttachmentDownloader().fetch_attachment(
+                                file_name=file_name,file_type=file_type,download_url=file_url,
+                                enable_proxy=False)
+                            attachments[str(len(attachments)+1)] = attachment
+                if len(attachments) > 0:
+                    list_item.projectinfo = {"attachments": attachments}
+
         yield list_item
 
 
+    def detail_json(self,request,response):
+
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+
+        exec(request.deal_detail)
+
+        yield list_item
+
 
     def failed_request(self, request, response):
         '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        if 200 <= code < 300:
+            err = 'analysis'
+        elif 300 <= code < 500:
+            err = 'download'
+        elif 500 <= code:
+            err = 'servers'
+        else:
+            err = 'timeout'
         mgp = MgpListItem()
+        mgp.code=code
+        mgp.error=err
         items = request.base_info
         for key in items:
             mgp.__setitem__(key,items[key])
         mgp.failed +=1
-        print(f'......{mgp.failed}')
+        if mgp.pri is None:
+            mgp.pri = 0
+
         if mgp.pri > 5:
             if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
                 if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
@@ -96,10 +169,5 @@ class ${spider_name}(feapder.Spider):
         yield mgp
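
The status-code bucketing above maps each failure to one of `analysis` / `download` / `servers` / `timeout` before the task is written back. The same mapping as a standalone, testable helper (the function name is illustrative):

```python
def classify_failure(status_code=None):
    """Map an HTTP status (or None for no response) to the error tag used above."""
    code = status_code or 0
    if 200 <= code < 300:
        return "analysis"      # page downloaded fine, so parsing is what failed
    if 300 <= code < 500:
        return "download"      # redirects / client errors count as download failures
    if code >= 500:
        return "servers"       # server-side errors
    return "timeout"           # no response at all (timeout, connection error, ...)

assert classify_failure(None) == "timeout"
assert classify_failure(200) == "analysis"
assert classify_failure(404) == "download"
assert classify_failure(502) == "servers"
```
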
 
 
-    def end_callback(self):
-        print("爬虫结束")
-
-
-
 if __name__ == "__main__":
-    Details(redis_key="fwork:details1").start()
+    Details(redis_key="{USER}:${spider_name}").start()

+ 61 - 45
FworkSpider/feapder/templates/project_template/CHECK_DATA.md

@@ -9,80 +9,96 @@ Created on {DATE}
 import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
+from items.spider_item import DataBakItem,MgpListItem
 from feapder.dedup import Dedup
 from collections import namedtuple
 
 
+
 class ${spider_name}(feapder.Spider):
 
     def start_callback(self):
+
+         self.site = ""
+
+         #               --- --- crawl_page 必须存在,且为纯数字(int) --- ---
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
          self.menus = [
              Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "Notice", 1),
+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
          ]
+
+         self.headers = {}
+
     def start_requests(self):
          for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f''
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
+             start_url = ''
+             yield feapder.Request(url=start_url,item=menu._asdict(),page=1,real_page=0,proxies=False)
+
 
     def parse(self, request, response):
+        real_count = 0
         menu = request.item
         dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
+        info_list = response.xpath('')       # 数据结构为html
         for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
+            href = info.xpath('').extract_first().strip()
+            title = info.xpath('').extract_first().strip()
+            publish_time = info.xpath('').extract_first().strip()
 
-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
+            area = ""   # 省份
+            city = ""   # 城市
+
+            data_item = DataBakItem()                # 存储数据的管道
+            data_item.href = href                    # 标书链接
             data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
             data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
+            data_item.title = title                  # 标题
+            data_item.publishtime = publish_time     # 标书发布时间
+            data_item.site = self.site
+            data_item.area = area or "全国"           # 省份 默认:全国
+            data_item.city = city                    # 城市 默认 为空
+
+            undedup_data = dedup.filter_exist_data([href])    # 去重
+            if undedup_data == []:
                 continue
+
             list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
+            list_item.parse = "self.detail_get"      # 详情页回调方法
+            list_item.parser_name = "details"        # 详情页标识 默认通用详情页
             list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="****"]',"*****"]
+            list_item.deal_detail = ['//div[@class="****"]']   # 抽取正文xpath
             list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list.files={
-                "list_xpath":'//div[@class="notice-foot"]/a',
+            list_item.parse_url = href               # 详情页请求地址
+            list_item.pri = 1                        # 执行等级
+
+            list_item.files={                        # 附件采集规则
+                "list_xpath":'//div[@class="***"]//a[@href]',
                 "url_xpath":'./@href',
                 "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'),
-                "file_type":'zip',
-                "url_key":'attachmentDownload',
-                # "host":'http',
-                "kwargs":{"headers": {
-                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-                }}
-            href_list.append(href)
+                "files_type":('zip','docx','ftp','pdf','doc','rar','gzzb',
+                              'jpg','png','zbid','xls','xlsx','swp','dwg'), # 需要下载的附件类型
+                #"file_type":'pdf',                  # 默认的附件类型,用于url中未带附件类型的
+                "url_key":'http',                    # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
+                "host":'',                           # 需要拼接url的host
+            }
+
+            dedup.add(href)
             yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
+            real_count += 1
+
+        # 无限翻页
+        request = self.infinite_pages(request, response)
+        yield request
+
+    def download_midware(self, request):
+        page = request.page              # 当前页码,可用于构造翻页参数
+        request.headers = self.headers
+
 
 if __name__ == "__main__":
     ${spider_name}(redis_key="{USER}:${spider_name}").start()
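
The template above filters each `href` through the bloom-filter dedup before yielding a detail task, and only calls `dedup.add(href)` afterwards. A self-contained sketch of that filter-then-add pattern, using an in-memory stand-in instead of `feapder.dedup.Dedup`:

```python
class SimpleDedup:
    """In-memory stand-in for feapder.dedup.Dedup, for illustration only."""

    def __init__(self):
        self._seen = set()

    def filter_exist_data(self, keys):
        # keep only keys that have not been crawled yet
        return [k for k in keys if k not in self._seen]

    def add(self, key):
        self._seen.add(key)


dedup = SimpleDedup()
real_count = 0
for href in ["http://a/1", "http://a/2", "http://a/1"]:
    if dedup.filter_exist_data([href]) == []:
        continue            # already crawled, skip
    # ... build DataBakItem / MgpListItem and yield it here ...
    dedup.add(href)         # mark as crawled only after the item was yielded
    real_count += 1
print(real_count)           # -> 2
```
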

+ 0 - 177
FworkSpider/feapder/utils/__init__.py

@@ -1,177 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/3/18 12:39 上午
----------
-@summary:  阿里云附件上传
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import hashlib
-import os
-import traceback
-import oss2
-import requests
-from feapder import setting
-import time
-
-class UploadOSS:
-    """阿里云 oss"""
-
-    def __init__(self):
-        oss_conf = setting.oss_
-        self.file_path: str = ""
-        self.file_stream: bytes = b''
-        self.__acc_key_id = oss_conf['key_id']
-        self.__acc_key_secret = oss_conf['key_secret']
-        self.__endpoint = oss_conf['endpoint']
-        self.__bucket_name = oss_conf['bucket_name']
-
-    @property
-    def fid(self):
-        """
-        文本摘要值
-
-        @return: 十六进制摘要值
-        """
-        sha1 = hashlib.sha1()
-        sha1.update(str(self.file_stream).encode("utf-8"))
-        return sha1.hexdigest()
-
-    @property
-    def file_size(self):
-        """
-        文件的大小,将字节(bytes)转化(kb/M/G单位)
-
-        @return: 文件大小
-        """
-        try:
-            size = os.path.getsize(self.file_path)
-        except Exception:
-            traceback.print_exc()
-        else:
-            try:
-                _kb = float(size) / 1024
-            except:
-                return "Error"
-            else:
-                if _kb >= 1024:
-                    _M = _kb / 1024
-                    if _M >= 1024:
-                        _G = _M / 1024
-                        return "{:.1f} G".format(_G)
-                    else:
-                        return "{:.1f} M".format(_M)
-                else:
-                    return "{:.1f} kb".format(_kb)
-
-    def get_state(self, attachment,count=0, **kwargs):
-        """
-        下载附件并上传阿里oss
-
-        @param attachment: 附件
-        @return: 附件处理结果
-        """
-        request_params = {
-            'headers': setting.headers,
-            'timeout': 20,
-            'stream': True,
-            **kwargs
-        }
-        with requests.get(attachment["org_url"], **request_params) as req:
-            if req.status_code == 200:
-                self.file_stream = req.content
-                # img_dir = "file"
-                img_dir = f"file/{attachment['channel']}"
-                # 文件夹不存在则创建文件夹
-                if not os.path.exists(img_dir):
-                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
-                # 打开目录,放入下载的附件
-                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
-                filname = filname.hexdigest() #加密1次
-                types = attachment["ftype"]
-                self.file_path = "{}/{}".format(img_dir, filname+'.'+types)
-                with open(self.file_path, 'wb') as f:
-                    f.write(self.file_stream)
-                # 上传附件
-                self.put_oss_from_local()
-                file_state = self.file_state(attachment)
-                # 删除附件
-                os.remove(self.file_path)
-                # 返回附件上传处理信息
-                return file_state
-            else:
-                if count<3:
-                    self.post_state(attachment,count=count+1, **kwargs)
-                else:
-                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                    attachment["url"] = 'oss'
-                    attachment["fid"] = self.fid + "." + attachment["ftype"]
-                    attachment["size"] = '0kb'
-                    attachment["false"] = True
-                    return attachment
-    def post_state(self, attachment,count=0, **kwargs):
-        """
-        下载附件并上传阿里oss
-
-        @param attachment: 附件
-        @return: 附件处理结果
-        """
-        request_params = {
-            'headers': setting.headers,
-            'timeout': 20,
-            'stream': True,
-            **kwargs
-        }
-        with requests.post(attachment["org_url"], **request_params) as req:
-            if req.status_code == 200:
-                self.file_stream = req.content
-                img_dir = f"file/{attachment['channel']}"
-                # 文件夹不存在则创建文件夹
-                if not os.path.exists(img_dir):
-                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
-                # 打开目录,放入下载的附件
-                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
-                filname = filname.hexdigest()  # 加密1次
-                types = attachment["ftype"]
-                self.file_path = "{}/{}".format(img_dir, filname + '.' + types)
-
-                with open(self.file_path, 'wb') as f:
-                    f.write(self.file_stream)
-                # 上传附件
-                self.put_oss_from_local()
-                file_state = self.file_state(attachment)
-                # 删除附件
-                # os.remove(self.file_path)
-                # 返回附件上传处理信息
-                return file_state
-            else:
-                if count<3:
-                    self.post_state(attachment,count=count+1, **kwargs)
-                else:
-                    attachment["url"] = 'oss'
-                    attachment["fid"] = self.fid + "." + attachment["ftype"]
-                    attachment["size"] = '0kb'
-                    attachment["false"] = True
-                    return attachment
-
-    def put_oss_from_local(self):
-        """上传一个本地文件到阿里OSS的普通文件"""
-        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
-        bucket.put_object_from_file(self.fid, self.file_path)
-
-    def file_state(self, attachment):
-        """
-        文件信息
-
-        @param attachment: 附件
-        @return: 附件上传处理信息
-        """
-        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-        attachment["url"] = 'oss'
-        attachment["fid"] = self.fid + "." + attachment["ftype"]
-        attachment["size"] = self.file_size
-        return attachment
-
-

+ 1 - 1
FworkSpider/feapder/utils/custom_argparse.py

@@ -2,7 +2,7 @@
 """
 Created on 2020/2/19 12:57 PM
 ---------
-@summary: 邮件发送
+@summary:
 ---------
 @author: Boris
 @email: boris_liu@foxmail.com

+ 53 - 41
FworkSpider/feapder/utils/js/stealth.min.js

@@ -10,11 +10,10 @@ Created on 2018-12-08 16:50
 import logging
 import os
 import sys
-import time
 from logging.handlers import BaseRotatingHandler
 
+import logstash
 import loguru
-import pymongo
 from better_exceptions import format_exception
 
 import feapder.setting as setting
@@ -41,47 +40,46 @@ class RotatingFileHandler(BaseRotatingHandler):
         self.max_bytes = max_bytes
         self.backup_count = backup_count
         self.placeholder = str(len(str(backup_count)))
-        self._to_db = None
-        self.filename = filename
-
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
-
-        return self._to_db.pyspider
 
+    def doRollover(self):
+        if self.stream:
+            self.stream.close()
+            self.stream = None
+        if self.backup_count > 0:
+            for i in range(self.backup_count - 1, 0, -1):
+                sfn = ("%0" + self.placeholder + "d.") % i  # '%2d.'%i -> 02
+                sfn = sfn.join(self.baseFilename.split("."))
+                # sfn = "%d_%s" % (i, self.baseFilename)
+                # dfn = "%d_%s" % (i + 1, self.baseFilename)
+                dfn = ("%0" + self.placeholder + "d.") % (i + 1)
+                dfn = dfn.join(self.baseFilename.split("."))
+                if os.path.exists(sfn):
+                    # print "%s -> %s" % (sfn, dfn)
+                    if os.path.exists(dfn):
+                        os.remove(dfn)
+                    os.rename(sfn, dfn)
+            dfn = (("%0" + self.placeholder + "d.") % 1).join(
+                self.baseFilename.split(".")
+            )
+            if os.path.exists(dfn):
+                os.remove(dfn)
+            # Issue 18940: A file may not have been created if delay is True.
+            if os.path.exists(self.baseFilename):
+                os.rename(self.baseFilename, dfn)
+        if not self.delay:
+            self.stream = self._open()
 
     def shouldRollover(self, record):
-        parmars = {
-            "spider_name":record.name,
-            "msg":record.msg,
-            "Message":str(record.getMessage)
-        }
-        if record.levelname == "ERROR":
-            crawl_type = 'list'
-            if 'detail' in record.name:
-                crawl_type = 'detail'
-            url = ''
-            item={
-                "recordname":record.name,
-                "spidercode":"spidercode",
-                "author":self.filename,
-                "account":"",
-                "crawl_time":time.time(),
-                "crawl_type": crawl_type,
-                "status_code":"status_code",
-                "url":url,
-                "reason":record.msg,
-                'parmars': parmars,
-            }
-
-            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
-            # print(item)
-            # print(self.to_db.error_info)
-            # self.to_db.error_info.insert_one(item)
 
+        if self.stream is None:  # delay was set...
+            self.stream = self._open()
+        if self.max_bytes > 0:  # are we rolling over?
+            # print('record >>>> ', record)
+            msg = "%s\n" % self.format(record)
+            self.stream.seek(0, 2)  # due to non-posix-compliant Windows feature
+            if self.stream.tell() + len(msg) >= self.max_bytes:
+                return 1
+        return 0
 
 
 def get_logger(
@@ -90,6 +88,7 @@ def get_logger(
     log_level=None,
     is_write_to_console=None,
     is_write_to_file=None,
+    is_send_to_logstash = None,
     color=None,
     mode=None,
     max_bytes=None,
@@ -113,6 +112,7 @@ def get_logger(
     @result:
     """
     # 加载setting里最新的值
+    # name = os.path.split(os.getcwd())[-1]
     name = name or setting.LOG_NAME
     path = path or setting.LOG_PATH
     log_level = log_level or setting.LOG_LEVEL
@@ -126,6 +126,13 @@ def get_logger(
         if is_write_to_file is not None
         else setting.LOG_IS_WRITE_TO_FILE
     )
+
+    is_send_to_logstash = (
+        is_send_to_logstash
+        if is_send_to_logstash is not None
+        else setting.LOG_IS_SEND_TO_LOGSTASH
+    )
+
     color = color if color is not None else setting.LOG_COLOR
     mode = mode or setting.LOG_MODE
     max_bytes = max_bytes or setting.LOG_MAX_BYTES
@@ -144,8 +151,8 @@ def get_logger(
 
     # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
     if is_write_to_file:
-        # if path and not os.path.exists(os.path.dirname(path)):
-        #     os.makedirs(os.path.dirname(path))
+        if path and not os.path.exists(os.path.dirname(path)):
+            os.makedirs(os.path.dirname(path))
 
         rf_handler = RotatingFileHandler(
             path,
@@ -156,11 +163,16 @@ def get_logger(
         )
         rf_handler.setFormatter(formatter)
         logger.addHandler(rf_handler)
+
+    if is_send_to_logstash:
+        logger.addHandler(logstash.TCPLogstashHandler(setting.LOGSTASH_IP, setting.LOGSTASH_PORT, version=1))
+
     if color and is_write_to_console:
         loguru_handler = InterceptHandler()
         loguru_handler.setFormatter(formatter)
         # logging.basicConfig(handlers=[loguru_handler], level=0)
         logger.addHandler(loguru_handler)
+
     elif is_write_to_console:
         stream_handler = logging.StreamHandler()
         stream_handler.stream = sys.stdout
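
`doRollover` above builds zero-padded backup names by splicing the padded index into the base filename split on `"."`. A small sketch of that naming scheme (the helper name is illustrative):

```python
def backup_name(base_filename, index, backup_count):
    """Reproduce the rollover naming above: 'spider.log' -> 'spider01.log' for backup_count=20."""
    placeholder = str(len(str(backup_count)))       # e.g. backup_count=20 -> width 2
    prefix = ("%0" + placeholder + "d.") % index    # '%02d.' % 1 -> '01.'
    return prefix.join(base_filename.split("."))

print(backup_name("spider.log", 1, 20))    # -> spider01.log
print(backup_name("spider.log", 12, 20))   # -> spider12.log
```
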

+ 14 - 8
FworkSpider/feapder/utils/metrics.py

@@ -17,13 +17,16 @@ from feapder.utils.log import log
 class RedisLock:
     redis_cli = None
 
-    def __init__(self, key, redis_cli=None, wait_timeout=0, lock_timeout=86400):
+    def __init__(
+        self, key, *, wait_timeout=0, lock_timeout=86400, redis_cli=None, redis_url=None
+    ):
         """
         redis超时锁
         :param key: 存储锁的key redis_lock:[key]
-        :param redis_cli: redis客户端对象
         :param wait_timeout: 等待加锁超时时间,为0时则不等待加锁,加锁失败
         :param lock_timeout: 锁超时时间 为0时则不会超时,直到锁释放或意外退出,默认超时为1天
+        :param redis_cli: redis客户端对象
+        :param redis_url: redis连接地址,若redis_cli传值,则不使用redis_url
 
         用法示例:
         with RedisLock(key="test") as _lock:
@@ -32,6 +35,7 @@ class RedisLock:
                 # do somethings
         """
         self.redis_conn = redis_cli
+        self.redis_url = redis_url
         self.lock_key = "redis_lock:{}".format(key)
         # 锁超时时间
         self.lock_timeout = lock_timeout
@@ -43,21 +47,23 @@ class RedisLock:
     @property
     def redis_conn(self):
         if not self.__class__.redis_cli:
-            self.__class__.redis_cli = RedisDB().get_redis_obj()
+            self.__class__.redis_cli = RedisDB(url=self.redis_url).get_redis_obj()
 
         return self.__class__.redis_cli
 
     @redis_conn.setter
     def redis_conn(self, cli):
-        self.__class__.redis_cli = cli
+        if cli:
+            self.__class__.redis_cli = cli
 
     def __enter__(self):
         if not self.locked:
             self.acquire()
-            # 延长锁的时间
-            thread = threading.Thread(target=self.prolong_life)
-            thread.setDaemon(True)
-            thread.start()
+            if self.locked:
+                # 延长锁的时间
+                thread = threading.Thread(target=self.prolong_life)
+                thread.setDaemon(True)
+                thread.start()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
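
`__enter__` above now starts the keep-alive thread only when the lock was actually acquired. A self-contained sketch of that pattern with a dummy lock (the redis calls are replaced by comments; this is not the real `RedisLock`):

```python
import threading
import time


class TimeoutLockSketch:
    """Illustrative stand-in: the keep-alive thread only runs after a successful acquire."""

    def __init__(self, wait_timeout=0, lock_timeout=300):
        self.wait_timeout = wait_timeout
        self.lock_timeout = lock_timeout
        self.locked = False

    def acquire(self):
        self.locked = True          # real lock: SET key NX EX lock_timeout in redis

    def prolong_life(self):
        while self.locked:
            time.sleep(1)           # real lock: periodically EXPIRE the key

    def __enter__(self):
        if not self.locked:
            self.acquire()
            if self.locked:         # only prolong a lock we actually hold
                threading.Thread(target=self.prolong_life, daemon=True).start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.locked = False         # real lock: DEL key


with TimeoutLockSketch() as lock:
    if lock.locked:
        pass                        # critical section
```
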

+ 37 - 32
FworkSpider/feapder/utils/tools.py

@@ -7,6 +7,7 @@ Created on 2018-09-06 14:21
 @author: Boris
 @email: boris_liu@foxmail.com
 """
+
 import asyncio
 import calendar
 import codecs
@@ -37,6 +38,7 @@ from pprint import pprint
 from urllib import request
 from urllib.parse import urljoin
 
+import bson
 import execjs  # pip install PyExecJS
 import redis
 import requests
@@ -45,8 +47,10 @@ from requests.cookies import RequestsCookieJar
 from w3lib.url import canonicalize_url as _canonicalize_url
 
 import feapder.setting as setting
+from feapder.db.redisdb import RedisDB
 from feapder.utils.email_sender import EmailSender
 from feapder.utils.log import log
+
 os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js
 
 # 全局取消ssl证书验证
@@ -61,18 +65,11 @@ redisdb = None
 def get_redisdb():
     global redisdb
     if not redisdb:
-        ip, port = setting.REDISDB_IP_PORTS.split(":")
-        redisdb = redis.Redis(
-            host=ip,
-            port=port,
-            db=setting.REDISDB_DB,
-            password=setting.REDISDB_USER_PASS,
-            decode_responses=True,
-        )  # redis默认端口是6379
+        redisdb = RedisDB()
     return redisdb
 
 
-# 装饰器 -- 单例模式
+# 装饰器
 class Singleton(object):
     def __init__(self, cls):
         self._cls = cls
@@ -598,20 +595,8 @@ def get_form_data(form):
     return data
 
 
-# mac上不好使
-# def get_domain(url):
-#     domain = ''
-#     try:
-#         domain = get_tld(url)
-#     except Exception as e:
-#         log.debug(e)
-#     return domain
-
-
 def get_domain(url):
-    proto, rest = urllib.parse.splittype(url)
-    domain, rest = urllib.parse.splithost(rest)
-    return domain
+    return urllib.parse.urlparse(url).netloc
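
The rewritten `get_domain` keeps the port in the result, just like the old `splithost`-based version, e.g.:

```python
from urllib.parse import urlparse

print(urlparse("https://www.example.com:8080/list?page=1").netloc)  # -> www.example.com:8080
print(urlparse("http://example.com/detail/1.html").netloc)          # -> example.com
```
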
 
 
 def get_index_url(url):
@@ -823,27 +808,31 @@ def jsonp2json(jsonp):
         raise ValueError("Invalid Input")
 
 
-def dumps_json(json_, indent=4, sort_keys=False):
+def dumps_json(data, indent=4, sort_keys=False):
     """
     @summary: 格式化json 用于打印
     ---------
-    @param json_: json格式的字符串或json对象
+    @param data: json格式的字符串或json对象
     ---------
     @result: 格式化后的字符串
     """
     try:
-        if isinstance(json_, str):
-            json_ = get_json(json_)
-
-        json_ = json.dumps(
-            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
+        if isinstance(data, str):
+            data = get_json(data)
+
+        data = json.dumps(
+            data,
+            ensure_ascii=False,
+            indent=indent,
+            skipkeys=True,
+            sort_keys=sort_keys,
+            default=str,
         )
 
     except Exception as e:
-        log.error(e)
-        json_ = pformat(json_)
+        data = pformat(data)
 
-    return json_
+    return data
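
`default=str` lets `dumps_json` print values the json module cannot serialize natively (datetime, Decimal, ObjectId, ...) instead of falling back to `pformat`. A minimal illustration of the same `json.dumps` call:

```python
import json
from datetime import datetime

data = {"title": "测试公告", "comeintime": datetime(2023, 3, 1, 12, 0, 0)}
print(json.dumps(data, ensure_ascii=False, indent=4, skipkeys=True, default=str))
# "comeintime" is rendered through str(): "2023-03-01 12:00:00"
```
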
 
 
 def get_json_value(json_object, key):
@@ -2552,3 +2541,19 @@ def ensure_float(n):
     if not n:
         return 0.0
     return float(n)
+
+
+def ensure_int64(n):
+    """
+    >>> ensure_int64(None)
+    0
+    >>> ensure_int64(False)
+    0
+    >>> ensure_int64(12)
+    12
+    >>> ensure_int64("72")
+    72
+    """
+    if not n:
+        return bson.int64.Int64(0)
+    return bson.int64.Int64(n)
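
`bson.int64.Int64` guarantees the value is stored as a MongoDB NumberLong even when it fits in 32 bits, which matters for long-typed fields such as `comeintime` / `publishtime`. For example (requires pymongo's `bson` package):

```python
from bson.int64 import Int64

ts = Int64(1677643200)
print(type(ts).__name__, ts)   # -> Int64 1677643200
assert isinstance(ts, int)     # Int64 subclasses int, so arithmetic still works
```
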

+ 167 - 72
FworkSpider/feapder/utils/webdriver.py

@@ -1,20 +1,23 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2021/3/18 4:59 下午
+Created on 2023-03-01
 ---------
-@summary:
+@summary: 远程selenium服务
 ---------
-@author: Boris
-@email: boris_liu@foxmail.com
+@author: dzr
+@email: dongzhaorui@topnet.net.cn
 """
 
+import os
 import queue
 import threading
-import os
+
 from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
+from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
 from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
 
+from feapder.setting import WEBDRIVER
 from feapder.utils.log import log
 from feapder.utils.tools import Singleton
 
@@ -22,9 +25,8 @@ DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit
 
 
 class WebDriver(RemoteWebDriver):
-    '''浏览器采集 - selenium'''
+    """浏览器采集 - selenium"""
     CHROME = "CHROME"
-    PHANTOMJS = "PHANTOMJS"
     FIREFOX = "FIREFOX"
 
     def __init__(
@@ -32,25 +34,33 @@ class WebDriver(RemoteWebDriver):
         load_images=True,
         user_agent=None,
         proxy=None,
-        headless=False,
-        driver_type=CHROME,
-        timeout=16,
+        driver_type=FIREFOX,
+        timeout=10,
         window_size=(1024, 800),
-        executable_path=None,
+        server_addr=None,
         custom_argument=None,
+        version=None,
+        usages_local_driver=True,
+        headless=False,
+        executable_path=None,
+        service_log_path=None,
         **kwargs
     ):
         """
-        webdirver 封装,支持chrome、phantomjs 和 firefox
+        webdriver 封装,支持 chrome 和 firefox
         Args:
             load_images: 是否加载图片
             user_agent: 字符串 或 无参函数,返回值为user_agent
             proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
             headless: 是否启用无头模式
-            driver_type: CHROME 或 PHANTOMJS,FIREFOX
+            driver_type: CHROME 或 FIREFOX...
             timeout: 请求超时时间
             window_size: # 窗口大小
             executable_path: 浏览器路径,默认为默认路径
+            server_addr: 远程服务地址
+            usages_local_driver: 使用本地驱动
+            service_log_path: selenium service 日志路径
+            version: 浏览器版本
             **kwargs:
         """
         self._load_images = load_images
@@ -59,18 +69,16 @@ class WebDriver(RemoteWebDriver):
         self._headless = headless
         self._timeout = timeout
         self._window_size = window_size
-        self._executable_path = executable_path
+        self._server_addr = server_addr or WEBDRIVER["server_addr"]
         self._custom_argument = custom_argument
-
-        self.proxies = {}
-        self.user_agent = None
+        self._version = version or WEBDRIVER["version"]
+        self._executable_path = executable_path
+        self._usages_local_driver = usages_local_driver
+        self._service_log_path = service_log_path
 
         if driver_type == WebDriver.CHROME:
             self.driver = self.chrome_driver()
 
-        elif driver_type == WebDriver.PHANTOMJS:
-            self.driver = self.phantomjs_driver()
-
         elif driver_type == WebDriver.FIREFOX:
             self.driver = self.firefox_driver()
 
@@ -93,30 +101,30 @@ class WebDriver(RemoteWebDriver):
         if exc_val:
             log.error(exc_val)
 
-        self.quit()
-        return True
+        self.get_driver().quit()
+        return False
 
     def get_driver(self):
         return self.driver
 
-    def firefox_driver(self):
+    def local_firefox_driver(self):
         firefox_profile = webdriver.FirefoxProfile()
         firefox_options = webdriver.FirefoxOptions()
         firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
-        firefox_profile.set_preference("dom.webdriver.enabled",False)
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
         if self._proxy:
             proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://","")
+            proxy = proxy.replace("socks5://", "")
             # 使用socks5 代理
             firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
             firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
             firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
-            # firefox_capabilities["marionette"] = True  # http代理的使用
 
         if self._user_agent:
             firefox_profile.set_preference(
                 "general.useragent.override",
-                self._user_agent() if callable(self._user_agent) else self._user_agent,
+                self._user_agent() if callable(
+                    self._user_agent) else self._user_agent,
             )
 
         if not self._load_images:
@@ -137,12 +145,14 @@ class WebDriver(RemoteWebDriver):
                 options=firefox_options,
                 firefox_profile=firefox_profile,
                 executable_path=self._executable_path,
+                service_log_path=self._service_log_path
             )
         else:
             driver = webdriver.Firefox(
                 capabilities=firefox_capabilities,
                 options=firefox_options,
                 firefox_profile=firefox_profile,
+                service_log_path=self._service_log_path
             )
 
         if self._window_size:
@@ -150,20 +160,73 @@ class WebDriver(RemoteWebDriver):
 
         return driver
 
-    def chrome_driver(self):
+    def remote_firefox_driver(self):
+        firefox_capabilities = {
+            "browserName": "firefox",
+            "platform": "ANY",
+            "version": self._version,
+            "javascriptEnabled": True,
+            "marionette": False,
+        }
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_options.add_argument("--disable-gpu")
+        firefox_options.set_preference("dom.webdriver.enabled", False)
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # 使用socks5 代理
+            ip, port = proxy.split(":")
+            firefox_options.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
+            firefox_options.set_preference('network.proxy.socks', ip)
+            firefox_options.set_preference('network.proxy.socks_port', int(port))
+            # firefox_capabilities["marionette"] = True  # http代理的使用
+
+        if self._user_agent:
+            firefox_options.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_options.set_preference("permissions.default.image", 2)
+
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
+        browser = webdriver.Remote(
+            command_executor=executor,
+            desired_capabilities=firefox_capabilities,
+            options=firefox_options
+        )
+
+        if self._window_size:
+            browser.set_window_size(*self._window_size)
+
+        return browser
+
+    def firefox_driver(self):
+        if self._usages_local_driver:
+            return self.local_firefox_driver()
+        return self.remote_firefox_driver()
+
+    def remote_chrome_driver(self):
+        chrome_capabilities = {
+            "browserName": "chrome",
+            "platform": "ANY",
+            "version": self._version,
+            "javascriptEnabled": True,
+        }
         chrome_options = webdriver.ChromeOptions()
+
         # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
         chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         chrome_options.add_experimental_option("useAutomationExtension", False)
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-gpu")
 
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
         if self._user_agent:
             chrome_options.add_argument(
                 "user-agent={}".format(
@@ -172,15 +235,19 @@ class WebDriver(RemoteWebDriver):
                     else self._user_agent
                 )
             )
+        # 不支持socks5协议
+        # if self._proxy:
+        #     chrome_options.add_argument(
+        #         "--proxy-server={}".format(
+        #             self._proxy() if callable(self._proxy) else self._proxy
+        #         )
+        #     )
+
         if not self._load_images:
             chrome_options.add_experimental_option(
                 "prefs", {"profile.managed_default_content_settings.images": 2}
             )
 
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
         if self._window_size:
             chrome_options.add_argument(
                 "--window-size={},{}".format(self._window_size[0], self._window_size[1])
@@ -191,68 +258,95 @@ class WebDriver(RemoteWebDriver):
             for arg in self._custom_argument:
                 chrome_options.add_argument(arg)
 
-        if self._executable_path:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options, executable_path=self._executable_path
-            )
-        else:
-            driver = webdriver.Chrome(chrome_options=chrome_options)
+        browser = webdriver.Remote(
+            command_executor=ChromeRemoteConnection(
+                remote_server_addr=self._server_addr,
+                keep_alive=True),
+            desired_capabilities=chrome_capabilities,
+            options=chrome_options
+        )
 
         # 隐藏浏览器特征
         with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
             js = f.read()
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
-
-        return driver
-
-    def phantomjs_driver(self):
-        import warnings
+            params = {
+                'cmd': 'Page.addScriptToEvaluateOnNewDocument',
+                'params': {'source': js}
+            }
+            browser.execute("executeCdpCommand", params)  # 注入脚本,返回值无需使用
 
-        warnings.filterwarnings("ignore")
+        return browser
 
-        service_args = []
-        dcap = DesiredCapabilities.PHANTOMJS
+    def local_chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        # docker 里运行需要
+        chrome_options.add_argument("--no-sandbox")
 
         if self._proxy:
-            service_args.append(
-                "--proxy=%s" % self._proxy() if callable(self._proxy) else self._proxy
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
             )
         if self._user_agent:
-            dcap["phantomjs.page.settings.userAgent"] = (
-                self._user_agent() if callable(self._user_agent) else self._user_agent
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
             )
         if not self._load_images:
-            service_args.append("--load-images=no")
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
 
         # 添加自定义的配置参数
         if self._custom_argument:
             for arg in self._custom_argument:
-                service_args.append(arg)
+                chrome_options.add_argument(arg)
 
         if self._executable_path:
-            driver = webdriver.PhantomJS(
-                service_args=service_args,
-                desired_capabilities=dcap,
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
                 executable_path=self._executable_path,
+                service_log_path=self._service_log_path
             )
         else:
-            driver = webdriver.PhantomJS(
-                service_args=service_args, desired_capabilities=dcap
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                service_log_path=self._service_log_path
             )
 
-        if self._window_size:
-            driver.set_window_size(self._window_size[0], self._window_size[1])
-
-        del warnings
+        # 隐藏浏览器特征
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
 
         return driver
 
+    def chrome_driver(self):
+        if self._usages_local_driver:
+            return self.local_chrome_driver()
+        return self.remote_chrome_driver()
+
     @property
     def cookies(self):
         cookies_json = {}
         for cookie in self.driver.get_cookies():
             cookies_json[cookie["name"]] = cookie["value"]
-
         return cookies_json
 
     @cookies.setter
@@ -274,8 +368,9 @@ class WebDriver(RemoteWebDriver):
         else:
             raise AttributeError
 
-    def __del__(self):
-        self.quit()
+    # def __del__(self):
+    #     if self.driver:
+    #         self.driver.quit()
 
 
 @Singleton
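
The rewritten wrapper picks a local driver or a `webdriver.Remote` session based on `usages_local_driver`, with `server_addr` / `version` falling back to `setting.WEBDRIVER`. A hedged usage sketch built from the constructor above (the grid address is a placeholder, and the context manager is assumed to return the wrapper itself, as in upstream feapder):

```python
from feapder.utils.webdriver import WebDriver

def fetch_title(url):
    with WebDriver(
        driver_type=WebDriver.FIREFOX,
        usages_local_driver=False,                    # use the remote selenium service
        server_addr="http://127.0.0.1:4444/wd/hub",   # placeholder grid address
        load_images=False,
        timeout=10,
        window_size=(1024, 800),
    ) as wd:
        driver = wd.get_driver()
        driver.get(url)
        return driver.title
```
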

+ 71 - 53
FworkSpider/items/__init__.py

@@ -1,27 +1,37 @@
-from feapder import Item
-from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
-import time
+import feapder.utils.tools as tools
 from feapder.utils.log import log
-global xxc
-xxc = 0
-class DataNjpcItem(Item):
+from items.base_item import SwordFishProjectItem
+from untils.check_data import CheckData
+from untils.tools import int2long, substitute, text_search
+
+
+class DataNjpcItem(SwordFishProjectItem):
+    """拟建数据"""
     def __init__(self):
-        # 一类字段
-        self.href = ""  # 非竞品快照页地址
-        self.projectname = ""  # 项目名称
-        self.publishtime = ""  # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.contentlhtml = ""  # 快照页源码
+        super(DataNjpcItem, self).__init__()
+
+        self.table_name = "data_bak"  # 拟建数据存储表名
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
+        # 一类字段
+        self.href = ""  # 非竞品详情页地址
+        self.title = ""  # 标题
+        self.projectname = ""  # 项目名称
+        self.publishtime = ""  # 文章发布时间(时间戳),单位:秒
         self.area = "全国"  # 省
         self.city = ""  # 市
-        self.district = ""  # 区县
+        self.district = ""  # 区/县
+        self.contenthtml = ""  # 详情页源码
+        self.detail = ""  # 详情页源码清洗之后的文本
+        self.projectinfo = None  # 附件信息,详见剑鱼拟建规范
 
-        # 辅助字段 存储时的辅助字段
-        self.save = True  # 区县
-        self.sendflag = False
+        # 默认设置
+        self.sendflag = "false"
+        self.T = "bidding"
+        self.infoformat = 2
 
         # 以下字段为 二类字段,没有则不做存储,不在存储结构中
         # 附件,默认为Null 正确的格式为 projectinfo.attachments = [{
@@ -79,60 +89,68 @@ class DataNjpcItem(Item):
         # 施工单位联系人	constructionunitperson
         # 施工单位联系方式	constructionunittel
         # 施工单位地址	constructionunitaddr
+
     def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.comeintime = int2long(time.time())
-        # 根据文章发布时间 生成发布时间的时间戳(秒级), 定义为long型
-        '''
-        如果无法解析到发布时间、可以考虑补一个发布时间
-        '''
-        # if "-" in self.publishtime:
-        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
-        # else:
-        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
+        if not self.title:
+            self.title = self.projectname
+            log.debug("请检测 < title > 是否正确!")
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 生成入库时间戳(秒级)
 
         if "-" in str(self.publishtime) and ":" in str(self.publishtime):
-            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
+            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
-            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
-        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13:
+            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
+        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13: # 或许是时间戳
             self.publishtime = int2long(int(str(self.publishtime)[:10]))
         else:
-            raise ValueError("The publication time format is incorrect -> %r " %(self.publishtime))
+            raise ValueError("发布时间格式不正确 -> %r " %(self.publishtime))
 
-        # 数据获取失败处理:输出错误日志
         if not self.projectname or not self.publishtime or not self.href:
-            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.projectname}")
-            self.save=False
-        if self.contentlhtml is not None and self.detail =='':
-            self.detail = substitute(self.contentlhtml)
-            '''
-            detail:去头、去尾
-            '''
+            self.save = False
+            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+
+        if not self.contenthtml:
+            self.save = False
+            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+        else:
+            if not self.detail:
+                self.detail = substitute(self.contenthtml)
+
             if text_search(self.detail).total == 0:
-                # 无正文内容时,该内容直接标记true, 不在被统计、不入生产库
                 self.sendflag = "true"
 
-class NjpcListItem(Item):
+        if not self.projectinfo:
+            del self.projectinfo
+
+
+class NjpcListItem(SwordFishProjectItem):
+
     def __init__(self):
-        # 一类字段
-        self.href = ""  # 非竞品快照页地址
-        self.projectname = ""  # 项目名称
-        self.publishtime = ""  # 文章发布时间(日期格式 xxxx-xx-xx)
+        super(NjpcListItem, self).__init__()
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
+        self.parser_name = ""  # 详情爬虫从MongoDB拉取任务的唯一标识,建议使用 spidercode 命名
+        self.parser_url = ""  # 详情页数据地址
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 入库时间
+
+        # 一类字段
+        self.href = ""  # 非竞品详情页地址
+        self.projectname = ""  # 项目名称
+        self.publishtime = ""  # 文章发布时间
         self.area = "全国"  # 省
         self.city = ""  # 市
-        self.district = ""  # 区县
-
-        # 辅助字段 存储时的辅助字段
-        self.save = True  # 区县
-        self.parser_name = ""  # 处理详情页爬虫的名称
-        self.parser_url = ""  # 处理详情页的url
-        self.failed = 0 #失败请求的计数
+        self.district = ""  # 区/县
 
+        self.request_params = {}  # 定义callback所需的参数,诸如render,headers,method,data,params等等,必须与requests请求的参数名称对应,否则无法识别
 
     def pre_to_db(self):
-        pass
-
+        if CheckData.channel(self.channel, group="njpc"):
+            code, reason = CheckData.title(self.projectname, group="njpc")
+            if code == 10106:
+                log.warning(f"{self.projectname}--不可入库,原因:{reason}")
+                self.save = False
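
`pre_to_db` above accepts `publishtime` as "YYYY-MM-DD HH:MM:SS", "YYYY-MM-DD", or a 10/13-digit timestamp and normalizes it to epoch seconds. The same branching as a standalone sketch (without the `int2long` / `tools` helpers):

```python
import time

def normalize_publishtime(publishtime):
    """Return epoch seconds for the publishtime formats accepted above."""
    value = str(publishtime)
    if "-" in value and ":" in value:
        return int(time.mktime(time.strptime(value, "%Y-%m-%d %H:%M:%S")))
    if "-" in value:
        return int(time.mktime(time.strptime(value, "%Y-%m-%d")))
    if len(value) in (10, 13):                     # already a second/millisecond timestamp
        return int(value[:10])
    raise ValueError("发布时间格式不正确 -> %r" % publishtime)

print(normalize_publishtime("2023-03-01 12:00:00"))
print(normalize_publishtime("2023-03-01"))
print(normalize_publishtime(1677643200000))        # milliseconds -> seconds
```
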

+ 115 - 111
FworkSpider/items/spider_item.py

@@ -1,140 +1,144 @@
-from feapder import Item
-from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
-import time
+import feapder.utils.tools as tools
 from feapder.utils.log import log
-from feapder.utils.tools import get_current_date
-from datetime import datetime
-import os
-from feapder import setting
-global xxc
-xxc = 0
+from items.base_item import SwordFishProjectItem
+from untils.check_data import CheckData
+from untils.tools import (
+    int2long,
+    substitute,
+    text_search,
+)
 
-class DataBakItem(Item):
 
+class DataBakItem(SwordFishProjectItem):
+    """标讯数据"""
     def __init__(self):
+        super(DataBakItem, self).__init__()
+
+        self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
+        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
+        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
         self.title = ""  # 文章标题
-        self.publishtime = ""   # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.spidercode = ""   # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""   # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""   # 采集的版块(编辑器爬虫平台定义)
-        self.area = "全国"   # 省
-        self.city = ""   # 市
-        self.competehref = None   # 竞品快照页地址
-        self.href = ""   # 非竞品快照页地址
-        self.publishdept = ""
-        self.iscompete=True
-        self.type = ""
-        self.T = "bidding"
+        self.s_title = ""  # 详情页标题(有必填),默认提供列表页标题
+        self.area = "全国"  # 省
+        self.city = ""  # 市
+        self.district = ""  # 区/县
+        self.publishtime = ""  # 文章发布时间(列表页或者详情页发布时间)
         self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
         self.comeintime = ""  # 入库时间戳(秒级), 需定义为long型
+        self.contenthtml = ""  # 详情页源码
+        self.detail = ""  # 详情页源码清洗之后的文本
+
+        self.href = ""  # 非竞品详情页地址
+        self.competehref = None  # 竞品详情页地址
+        self.projectinfo = None  # 附件信息,详见剑鱼招投标规范
+
+        self.iscompete = True  # 新爬虫
+
         self.sendflag = "false"
+        self.T = "bidding"
+        self.infoformat = 1
+
+        # 默认设置
+        self.type = ""
+        self.publishdept = ""
         self._d = "comeintime"
-        self.contenthtml = ""  # 快照页源码
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.projectinfo = None  # 快照页源码清洗之后招投标文本
-        self.save = True
-    def stop(self):
-        self.save=False
-        raise HtmlEmptyError
 
     def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.comeintime = int2long(time.time())
-        # 根据文章发布时间 生成发布时间的时间戳(秒级), 定义为long型
-        '''如果无法解析到发布时间、可以考虑补一个发布时间
-        '''
+        if not self.s_title:
+            self.s_title = self.title
+            log.debug("请检测 < s_title > 是否正确!")
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 生成入库时间戳(秒级), 定义为long型
+
         if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
+            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         else:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
+            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
 
-        # 数据获取失败处理:输出错误日志
-        if self.contenthtml is None and self.projectinfo is None:
-            log.error(f"{self.href},此链接数据正文抓取失败")
-            # self.sendflag = "true"
-            self.stop()
-        if not self.title or not self.publishtime or not self.href:
-            # self.sendflag = "true"
-            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.title}")
-            self.stop()
         # html处理正文
-        if self.contenthtml is not None and self.detail =='':
-            self.detail = substitute(self.contenthtml)
-            '''
-            detail:去头、去尾
-            '''
+        if not self.contenthtml:
+            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+            self.save = False
+        else:
+            if not self.detail:
+                self.detail = substitute(self.contenthtml)
+
             if text_search(self.detail).total == 0:
-                # 无正文内容时,该内容直接标记true, 不在被统计
-                self.sendflag = "true"
+                self.sendflag = "true"   # 无内容数据,数据不入保存服务
+
+        if not self.title or not self.publishtime or not self.href:
+            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+            self.save = False
+
+        # 竞品网站-详情页地址标识字段
+        if not self.competehref:
+            del self.competehref
 
+        # 详情无附件,不需要 projectinfo 字段
+        if not self.projectinfo:
+            del self.projectinfo
 
-class MgpListItem(Item):
+
+class ExamineAndApproveItem(DataBakItem):
+    """审批数据"""
     def __init__(self):
-        # self.__table_name__='ggg_list'
-
-        self.parse = "" # 需要调用的方法名称
-        self.item = "" # 传过来的参数
-        self.parser_name = "" # 处理详情页的爬虫名
-        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 当前日期时间
-        self.comeintime = int2long(int(time.time())) # 当前日期时间戳
-        self.deal_detail = [] # 定义解析详情页主页内容的解析,detail_get是一个xpath列表,detail_post 则是一段处理代码
-        self.create_time = None # 定义解析详情页发布时间的xpath,列表页无发布时间时应用
-        self.parse_url = "" # 定义解析详情页主页内容的xpath
-        self.request_params = {} # 定义callback所需的参数,诸如render,headers,method,data,params等等,
-                                # 必须与requests请求的参数名称对应,否则无法识别
-        self.failed = 0 #失败请求的计数
-        self.author = "开发及维护人员" # 开发及维护人员
-        self.ex_js = ''  # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
-        self.ex_python = None # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
-        self.pri = 1 # 爬虫报警级 可分9级
-        self.proxies = True # 爬虫报警级 可分9级
-        self.files = False # 附件采集配置
-        self.error = None
-        self.spidercode = ""
-        self.save=True
-
-        # self.error_info =
-    def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.author = os.path.basename(os.getcwd())
-        self.spidercode = self.item.get("spidercode")
+        super(ExamineAndApproveItem, self).__init__()
 
-        if "通知公告" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-        elif "公告公示" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
+        self.table_name = "data_bak"
+
+        self.T = "bidding"
+        self.infoformat = 2
+
+
+class PropertyRightItem(DataBakItem):
+    """产权数据"""
+    def __init__(self):
+        super(PropertyRightItem, self).__init__()
+
+        self.table_name = "data_bak"
+
+        self.T = "bidding_other"
+        self.infoformat = 3
 
-        global xxc
-        xxc += 1
 
-    def open_spider(self):
-        pass
+class MgpListItem(SwordFishProjectItem):
 
-class ListItem(Item):
     def __init__(self):
+        super(MgpListItem, self).__init__()
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
-        self.url = ''
-        self.count=0
-        self.code=-1
-        self.rel_count = 0
-        self.save=True
 
-    def pre_to_db(self):
-        time.sleep(0.1)
-        self.author = setting.author.get(os.path.basename(os.getcwd()))
-        if self.author is None:
-            self.author = os.path.basename(os.getcwd())
-        self.runtime = get_current_date(date_format="%Y-%m-%d")
-        global xxc
-        print("xxc___________________",xxc)
-        self.rel_count = xxc
-        xxc = 0
+        self.parse_url = ""  # 详情爬虫访问地址
+        self.parser_name = ""  # 详情爬虫从MongoDB拉取任务的唯一标识,建议使用 spidercode 命名
+        self.parse = ""  # 详情爬虫解析回调方法名
+
+        self.request_params = {}  # 定义callback所需的参数,诸如render,headers,method,data,params等等,必须与requests请求的参数名称对应,否则无法识别
+        self.proxies = True  # 代理
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 入库时间
 
+        self.deal_detail = []  # 定义解析详情页主页内容的xpath列表
+        self.ex_js = ""  # 定义需要执行的js代码,包括但不限于script、文件路径等
+        self.ex_python = None  # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
 
+        self.files = False  # 采集附件配置
 
+    @property
+    def item(self) -> dict:
+        return self.__dict__["item"]
+
+    @item.setter
+    def item(self, data_item: DataBakItem):
+        self.__dict__["item"] = data_item.to_dict
+
+    def pre_to_db(self):
+        self.spidercode = self.item["spidercode"]
+
+        title = self.item.get("title")
+        channel = self.item["channel"]
+        if CheckData.channel(channel):
+            code, reason = CheckData.title(title)
+            if code == 10106:
+                log.warning(f"{title}--不可入库,原因:{reason}")
+                self.save = False
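
With `item` exposed as a property, list spiders assign the `DataBakItem` object directly and the setter stores its dict form. A sketch of how a list spider assembles a detail task with the new item classes, following the templates earlier in this commit (xpath and field values are placeholders):

```python
from items.spider_item import DataBakItem, MgpListItem

def build_detail_task(menu, site, title, href, publish_time):
    data_item = DataBakItem()
    data_item.site = site
    data_item.channel = menu["channel"]
    data_item.spidercode = menu["code"]
    data_item.title = title
    data_item.href = href
    data_item.publishtime = publish_time

    list_item = MgpListItem()
    list_item.item = data_item                    # setter stores data_item.to_dict
    list_item.parse = "self.detail_get"           # detail-page callback name
    list_item.parser_name = "details"             # queue the detail spider pulls tasks from
    list_item.parse_url = href
    list_item.deal_detail = ['//div[@class="content"]']   # placeholder xpath
    list_item.proxies = False
    return list_item
```
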

+ 0 - 0
FworkSpider/login_pool/__init__.py


+ 0 - 95
FworkSpider/login_pool/zglbw.py

@@ -1,95 +0,0 @@
-
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import LoginCookiePool
-import requests
-class ZglbwPool(LoginCookiePool):
-
-    def create_cookie(self, username, password):
-        print(username,password)
-        '''
-        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-        
-        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-        '''
-        session = requests.Session()
-        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"}
-        url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=9d424669-5af6-4b3d-bed5-56cc06bd5ca6'
-        data = {
-            "clear": "",
-            "BackURL": "null",
-            "username": username,
-            "password": password,
-            "jcaptchaCode": "shmt"
-        }
-        session.get(url,headers=headers)
-        session.post(url, data=data)
-        # print(res.headers)
-        ss = session.get(url='https://eproport.crecgec.com/getAuthentication')
-        print(ss.text)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        print(cookies)
-        return cookies
-
-
-
-
-# cookie_pool = ZglbwPool(username_key='username', password_key="password", table_userbase='zglbw',
-#                               redis_key='zglbw')
-# # cookie_pool.create_cookie('zuoshang123',"123qwe!A")
-# # # res = requests.get('https://eproport.crecgec.com/getAuthentication',cookies=cookie)
-# # # print(res.text)
-# cookie_pool.del_cookie(cookie_pool.get_cookie())
-
-
-# def create_cookie():
-#     '''
-#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-#
-#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-#     '''
-#     session = requests.Session()
-#     url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&response_type=code'
-#     data = {
-#         "clear": "",
-#         "BackURL": "null",
-#         "username": "zuoshang123",
-#         "password": "123qwe!A",
-#         "jcaptchaCode": "shmt"
-#     }
-#     session.get(url)
-#     res = session.post(url, data=data)
-#
-# create_cookie()
-# # import requests
-#
-#
-#
-# # cookies = {
-# #     "srv_id": "53069e9fd596ee2f1c7cf21d24bd170e",
-# #     "uid": "e423da7f-1d30-4571-a011-429326f1cfd1",
-# #     "Hm_lvt_89c053c39b2269b8a37c5881ca224223": "1642647201",
-# #     "JSESSIONID": "752173C3FF0C519DB45BBF781CEC76CB",
-# #     "Hm_lpvt_89c053c39b2269b8a37c5881ca224223": "1642661696"
-# # }
-# # url = "https://passport.crecgec.com/authorize"
-# # params = {
-# #     "type": "cas",
-# #     "client_id": "10000000`53",
-# #     "response_type": "code"
-# # }
-# # data = {
-# #     "clear": "",
-# #     "BackURL": "null",
-# #     "username": "zuoshang123",
-# #     "password": "123qwe!A",
-# #     "jcaptchaCode": "shmt"
-# # }
-# # response = requests.post(url, headers=headers, cookies=cookies, params=params, data=data)
-# #
-# # print(response.text)
-# # print(response)

+ 0 - 56
FworkSpider/mongo_pipeline.py

@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 导出数据
----------
-@author: 马国鹏
-@email:  305021384@qq.com
-"""
-from typing import Dict, List, Tuple
-import time
-from feapder.db.redisdb import RedisDB
-from feapder.dedup import Dedup
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-from untils.tools import *
-
-
-
-class RedisPipeline(BasePipeline):
-    '''数据存储管道-redis版'''
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = RedisDB()
-            print("创建新连接?")
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-        """
-        try:
-            add_count = self.to_db.lpush(table="savemongo:"+table, values=items)
-            print(add_count)
-            datas_size = len(items)
-            log.info(
-                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
-                % (datas_size, table, len(items), datas_size - len(items))
-            )
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-

+ 0 - 98
FworkSpider/mongo_pipeline_old.py

@@ -1,98 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 导出数据
----------
-@author: 马国鹏
-@email:  305021384@qq.com
-"""
-from typing import Dict, List, Tuple
-import time
-from feapder.db.mongodb import MongoDB
-from feapder.dedup import Dedup
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-from untils.tools import *
-# from crawlab import save_item
-
-
-
-class MongoPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-            print("创建新连接?")
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-        """
-        try:
-            add_count = self.to_db.add_batch(coll_name=table, datas=items)
-            for item in items:
-                dedup = Dedup(Dedup.BloomFilter)
-                dedup.add([item.get("href")])
-                # save_item({'count':item.get("href")})
-            datas_size = len(items)
-            log.info(
-                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
-                % (datas_size, table, add_count, datas_size - add_count)
-            )
-            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
-            # for i in range(add_count):
-            # if table == "mgp_list":
-            #     save_item({"site": "失败回填", "title": add_count})
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-
-    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
-        """
-        更新数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-            update_keys: 更新的字段, 如 ("title", "publish_time")
-
-        Returns: 是否更新成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-        try:
-            # self.to_db.find()
-            add_count = self.to_db.add_batch(
-                coll_name=table,
-                datas=items,
-                update_columns=update_keys or list(items[0].keys()),
-            )
-            datas_size = len(items)
-            update_count = datas_size - add_count
-            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
-                datas_size,
-                table,
-                add_count,
-                update_count,
-            )
-            if update_keys:
-                msg += " 更新字段为 {}".format(update_keys)
-            log.info(msg)
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False

+ 89 - 151
FworkSpider/setting.py

@@ -1,181 +1,119 @@
 # -*- coding: utf-8 -*-
 """爬虫配置文件"""
+import datetime
 import os
-import time
 import sys
-# from scoket_proxy import Socks5Proxy
-#
-# # MYSQL
-# MYSQL_IP = "localhost"
-# MYSQL_PORT = 3306
-# MYSQL_DB = ""
-# MYSQL_USER_NAME = ""
-# MYSQL_USER_PASS = ""
-#
+
 # MONGODB
-# MONGO_IP = "192.168.20.51"  # 本地 docker 环境
-MONGO_IP = "172.17.4.87"  # 线上环境
+MONGO_IP = "172.17.4.87"
 MONGO_PORT = 27080
-# MONGO_PORT = 27001
 MONGO_DB = "py_spider"
-# MONGO_USER_NAME = ""
-# MONGO_USER_PASS = ""
-#
-# # REDIS
-# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-# REDISDB_IP_PORTS = "192.168.20.51:6379"  # 本地 docker 环境
-REDISDB_IP_PORTS = "172.19.0.1:6379"  # 环境
-# REDISDB_USER_PASS = ""
+
+# REDIS
+# ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+REDISDB_IP_PORTS = "172.17.4.232:7361"
+REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
 REDISDB_DB = 10
-# # 适用于redis哨兵模式
-REDISDB_SERVICE_NAME = "quchoong"  # 没用到
-#
-# # 数据入库的pipeline,可自定义,默认MysqlPipeline
+
+# 数据入库的pipeline,可自定义,默认RedisPipeline
 ITEM_PIPELINES = [
-    # "feapder.pipelines.mysql_pipeline.MysqlPipeline",
     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
-    "mongo_pipeline.MongoPipeline"
+    "feapder.pipelines.swordfish.redis_pipeline.RedisPipeline"
 ]
-EXPORT_DATA_MAX_FAILED_TIMES = 5 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
-EXPORT_DATA_MAX_RETRY_TIMES = 5 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
-#
-# # 爬虫相关
-# # COLLECTOR
-# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
-# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
-#
-REDIS_KEY = "fwork" # 没用到
-# # SPIDER
-SPIDER_THREAD_COUNT = 1  # 爬虫并发数
-# SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
-# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
-SPIDER_MAX_RETRY_TIMES = 5  # 每个请求最大重试次数
-# KEEP_ALIVE = False  # 爬虫是否常驻
-#
-# # 浏览器渲染
-WEBDRIVER  = dict(
+# 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
+EXPORT_DATA_MAX_FAILED_TIMES = 5
+# 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
+EXPORT_DATA_MAX_RETRY_TIMES = 5
+
+COLLECTOR_TASK_COUNT = 100  # 每次获取任务数量
+
+# 爬虫
+SPIDER_THREAD_COUNT = 1  # 爬虫并发数,追求速度推荐32
+SPIDER_MAX_RETRY_TIMES = 3  # 每个请求最大重试次数
+
+# 浏览器渲染
+WEBDRIVER = dict(
+    server_addr="http://172.17.4.232:6666/wd/hub",  # selenium 远程服务地址
+    version="",  # 浏览器版本。不指定版本时,随机分发,版本详见群公告
     pool_size=1,  # 浏览器的数量
     load_images=False,  # 是否加载图片
-    # user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
-    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
     headless=True,  # 是否为无头浏览器
-    driver_type="FIREFOX",  # CHROME、PHANTOMJS、FIREFOX
+    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    driver_type="FIREFOX",  # CHROME、FIREFOX、EDGE
     timeout=30,  # 请求超时时间
+    executable_path=None,  # 浏览器路径,默认为默认路径
+    usages_local_driver=True,  # 是否使用本地驱动,默认启动本地驱动
     window_size=(1280, 800),  # 窗口大小
-    # executable_path="D:\\geckodriver.exe",  # 浏览器路径,默认为默认路径
     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+    service_log_path=os.devnull  # 日志路径,默认置空
 )
-#wget https://github.com/mozilla/geckodriver/releases/download/v0.25.0/geckodriver-v0.25.0-linux64.tar.gz
-# # 爬虫启动时,重新抓取失败的requests
-# RETRY_FAILED_REQUESTS = False
-# # 保存失败的request
-# SAVE_FAILED_REQUEST = True
-# # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
-# REQUEST_LOST_TIMEOUT = 600  # 10分钟
-# # request网络请求超时时间
-# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
-#
-# # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
-# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
-# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
-# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
-#
-# # 设置代理
-PROXY_EXTRACT_API = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"  # 代理提取API ,返回的代理分割符为\r\n
+# 爬虫启动时,重新入库失败的item
+RETRY_FAILED_ITEMS = True
+
+# 保存失败的request
+SAVE_FAILED_REQUEST = False
+
+# request网络请求超时时间
+REQUEST_TIMEOUT = 60
+
+# 调度器,存放item与request的根目录
+REDIS_KEY = "fwork"
+
+# 设置代理,代理提取API ,返回的代理分割符为\r\n
+PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
 PROXY_ENABLE = True
-#
-# # 随机headers
-# RANDOM_HEADERS = True
-# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
-# USER_AGENT_TYPE = "chrome"
-# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
-# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-# # requests 使用session
-# USE_SESSION = False
-#
-# # 去重
-# ITEM_FILTER_ENABLE = False  # item 去重
-# REQUEST_FILTER_ENABLE = False  # request 去重
-# ITEM_FILTER_SETTING = dict(
-#     filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
-# )
-# REQUEST_FILTER_ENABLE = True  # request 去重
-# REQUEST_FILTER_SETTING = dict(
-#     filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
-#     expire_time=2592000,  # 过期时间1个月
-# )
-#
-# # 报警 支持钉钉、企业微信、邮件
-# # 钉钉报警
-# DINGDING_WARNING_URL = ""  # 钉钉机器人api
-# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
-# DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False
-# # 邮件报警
-# EMAIL_SENDER = ""  # 发件人
-# EMAIL_PASSWORD = ""  # 授权码
-# EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
-# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
-# # 企业微信报警
-# WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=89f0b1e9-8d08-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
+
+# item去重
+ITEM_FILTER_ENABLE = True  # item 去重
+ITEM_FILTER_SETTING = dict(
+    filter_type=5,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
+    expire_time=63072000,  # 过期时间2年
+    redis_url=["172.17.4.239:2479", "172.17.4.240:2579", "172.17.4.84:2379"],  # 集群节点
+)
+
+# 企业微信报警
 WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
-WECHAT_WARNING_PHONE = "马国鹏"  # 报警人 将会在群内@此人, 支持列表,可指定多人
+WECHAT_WARNING_PHONE = "swordFish"  # 报警人 将会在群内@此人, 支持列表,可指定多人
 WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
-# # 时间间隔
+# 时间间隔
 WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
-WARNING_LEVEL = "INFO"  # 报警级别, DEBUG / ERROR
+WARNING_LEVEL = "ERROR"  # 报警级别, DEBUG / ERROR
 WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
-#
-#LOG_NAME = os.path.basename(os.getcwd())
 
-DTIME = time.strftime("%Y-%m-%d", time.localtime(time.time()))
-LOG_NAME = os.path.split(sys.argv[0])[-1].split('.')[0]
-LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
-LOG_LEVEL = "INFO"
+# 日志设置
+DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
+LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
+LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log存储路径
+LOG_LEVEL = "ERROR"
 LOG_COLOR = True  # 是否带有颜色
-LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
-# LOG_IS_WRITE_TO_FILE = True  # 是否写文件
-# LOG_MODE = "w"  # 写文件的模式
+LOG_IS_WRITE_TO_CONSOLE = True  # 是否打印到控制台
+LOG_IS_WRITE_TO_FILE = True  # 是否写文件
+LOG_MODE = "w"  # 写文件的模式
 LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
 LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_ENCODING = "utf8"  # 日志文件编码
 OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级 一般用不到
-#
-# # 切换工作路径为当前项目路径
-# project_path = os.path.abspath(os.path.dirname(__file__))
-# os.chdir(project_path)  # 切换工作路经
-# sys.path.insert(0, project_path)
-# print('当前工作路径为 ' + os.getcwd())
-
-# 代理服务-未解析的
-jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
-
-headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
-
-# 文件存储功能的配置信息
-oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh',
-      'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
-# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
-
-author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
-
-# 线上代理服务的api地址
-JIANYU_PROXY_URL = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
-JIANYU_PROXY_AUTHOR = 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'
-
-# splash 渲染服务的api接口配置
-JIANYU_SPLASH_URL = 'http://8.131.72.226:8998/render.json'
-
-# 测试环境的redis集群 -- url去重专用
-REDISCLUSTER =  [
-                {"host": "192.168.3.207", "port": "2179"},
-                {"host": "192.168.3.166", "port": "2379"}
-            ]
-
-# 正式环境的redis集群 -- url去重专用
-# REDISCLUSTER =  [
-#                 {"host": "172.17.4.239", "port": "2479"},
-#                 {"host": "172.17.4.240", "port": "2579"},
-#                 {"host": "172.17.4.84", "port": "2379"}
-#             ]
+# elk服务
+LOG_IS_SEND_TO_LOGSTASH = False
+LOGSTASH_IP = "47.95.151.156"  # 已失效("47.95.151.156")
+LOGSTASH_PORT = 5044
+
+# 自建代理池
+SWORDFISH_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+
+# splash 渲染服务
+SWORDFISH_RENDER_URL = "http://59.110.6.43:8998/render.json"
+
+# 爬虫心跳
+HEARTBEAT_TABLE = "spider_heartbeat"  # 爬虫采集心跳记录表名
+
+# 远程bucket配置
+ALI_BUCKET_CONFIG = {
+    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
+    "bucket_name": "jy-datafile"
+}
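
For reference, a minimal sketch of how the renamed SWORDFISH_PROXY_URL / SWORDFISH_PROXY_AUTHOR pair (successor to the old JIANYU_PROXY_* constants) can be consumed. It assumes the author string is sent verbatim as an Authorization header and that FworkSpider/setting.py is importable as setting from the project root; neither assumption is verified against the live proxy service.

import requests

import setting  # FworkSpider/setting.py, assumed importable from the project root


def fetch_proxies():
    # Assumption: the author string is passed verbatim as an Authorization header
    headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
    resp = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.text  # payload format depends on the proxy service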

+ 13 - 30
FworkSpider/untils/WebCookiePool.py

@@ -1,19 +1,16 @@
-import json
 import sys
-import requests
-import re,execjs
-
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-# from utils.cookie_pool import PageCookiePool
+
 from feapder.utils.webdriver import WebDriver
 from feapder.utils.log import log
 from untils.cookie_pool import PageCookiePool
+import feapder.utils.tools as tools
+
 
 class WebCookiePool(PageCookiePool):
-    def __init__(self, redis_key, page_url=None,cookie_key=None,
-                 min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs):
-        super(WebCookiePool, self).__init__(redis_key, page_url=None,
-                                           min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs)
+
+    def __init__(self, redis_key, page_url, cookie_key, **kwargs):
+        super(WebCookiePool, self).__init__(redis_key, **kwargs)
         self.page_url = page_url
         self.cookie_key = cookie_key
         self._kwargs = kwargs
@@ -22,30 +19,16 @@ class WebCookiePool(PageCookiePool):
         self._kwargs.setdefault("driver_type", "FIREFOX")
 
     def create_cookie(self):
-        with WebDriver(**self._kwargs) as driver_pool:
-            import time
-            # time.sleep(1111)
+        with WebDriver(**self._kwargs) as browser:
             try:
-                # driver_pool = self.driver_pool.get()
-                driver_pool.get(self.page_url)
+                browser.get(self.page_url)
                 count = 0
-                while self.cookie_key not in driver_pool.cookies.keys():
-                    time.sleep(1)
-                    count+=1
-                    if count>=30:
+                while self.cookie_key not in browser.cookies.keys():
+                    tools.delay_time(1)
+                    count += 1
+                    if count >= 30:
                         return
-                cookies = driver_pool.cookies
+                cookies = browser.cookies
                 return cookies
             except Exception as e:
                 log.error(f"获取cookie失败,{e}")
-
-
-if __name__ == '__main__':
-    for i in range(10):
-        print(f'开始第{i+1}次获取cookie')
-        if i%3==0:
-            WebCookiePool(redis_key='gdcookie',cookie_key='SUB',page_url="https://weibo.com/p/1005051203448454/home?from=page_100505_profile&wvr=6&mod=data&is_all=1#place").create_cookie()
-        elif i%3==1:
-            WebCookiePool(redis_key='gd2cookie',cookie_key='locale',page_url="https://www.jianshu.com/p/4c5bc85fc3fd").create_cookie()
-        else:
-            WebCookiePool(redis_key='gd3cookie',cookie_key='cna',page_url="https://docs-next.crawlab.cn/zh/guide/installation/docker.html#%E5%A4%96%E9%83%A8-mongodb").create_cookie()
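
A usage sketch for the slimmed-down constructor, standing in for the examples dropped from the old __main__ block above; the page URL and cookie name are placeholders, and extra keyword arguments are assumed to pass through to WebDriver as before.

from untils.WebCookiePool import WebCookiePool

pool = WebCookiePool(
    redis_key="demo_cookie",          # redis key prefix for this cookie pool
    page_url="https://example.com/",  # page that issues the wanted cookie (placeholder)
    cookie_key="SESSIONID",           # cookie name to wait for (placeholder)
    headless=True,                    # extra kwargs are forwarded to WebDriver
)
cookies = pool.create_cookie()  # dict of cookies, or None if the key never appears within ~30s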

+ 13 - 2
FworkSpider/untils/__init__.py

@@ -1,7 +1,6 @@
 import oss2
 
-# from config.load import oss_conf
-from feapder.setting import oss_ as oss_conf
+from feapder.setting import ALI_BUCKET_CONFIG as oss_conf
 
 
 class AliYunService:
@@ -22,3 +21,15 @@ class AliYunService:
         auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
         bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
         bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        流式上传oss
+
+        :param str key: 上传到OSS的文件名
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object(key, data)
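
A minimal call sketch for the new streaming upload; the object key and payload are placeholders, and the import path assumes the project root (FworkSpider) is on sys.path. Bucket credentials come from ALI_BUCKET_CONFIG.

from untils import AliYunService

# upload an in-memory payload without writing a temporary file first
AliYunService().push_oss_from_stream("demo/hello.txt", b"hello oss")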

+ 212 - 169
FworkSpider/untils/attachment.py

@@ -1,20 +1,28 @@
 import hashlib
+import io
 import os
-import sys
 import traceback
 import uuid
-from urllib import request
+
 import requests
+import tqdm
 import urllib3
-from feapder.setting import headers
-from untils.execptions import AttachmentNullError
+
 from untils.aliyun import AliYunService
+from untils.execptions import AttachmentNullError
 from untils.proxy_pool import ProxyPool
-import time
-import tqdm
+
 urllib3.disable_warnings()
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+    'Accept': '*/*'
+}
+
+
 class AttachmentDownloader:
-    '''附件下载模块'''
+    """附件下载模块"""
+
     def __init__(self):
         self.dir_name = 'file'
 
@@ -22,92 +30,141 @@ class AttachmentDownloader:
         if not os.path.exists(self.dir_name):
             os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
 
-    def create_file_path(self, filename, file_type):
+    def create_file(self, filename, file_type):
         self.create_dir()
-        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
-        tmp_name = "{}.{}".format(sign, file_type)
-        return "{}/{}".format(self.dir_name, tmp_name)
+        sign = self._hash("{}_{}".format(filename, uuid.uuid4()))
+        local_file_name = "{}.{}".format(sign, file_type)
+        return "{}/{}".format(self.dir_name, local_file_name)
+
+    def create_fid(self, data: bytes):
+        return self._hash(data)
 
-    def hex_sha1(self,val):
-        sha1 = hashlib.sha1()
+    @staticmethod
+    def _hash(val):
+        _sha1 = hashlib.sha1()
         if isinstance(val, bytes):
-            sha1.update(str(val).encode("utf-8"))
+            _sha1.update(str(val).encode("utf-8"))
         elif isinstance(val, str):
-            sha1.update(val.encode("utf-8"))
-        res = sha1.hexdigest()
-        return res
+            _sha1.update(val.encode("utf-8"))
+        return _sha1.hexdigest()
 
     @staticmethod
-    def create_fid(file_stream: bytes):
-        sha1 = hashlib.sha1()
-        if isinstance(file_stream, bytes):
-            sha1.update(str(file_stream).encode("utf-8"))
-        elif isinstance(file_stream, str):
-            sha1.update(file_stream.encode("utf-8"))
-        res = sha1.hexdigest()
-        return res
+    def clean_attachment(file_path):
+        """
+        删除文件
 
+        :param str file_path: 文件路径
+        """
+        try:
+            os.remove(file_path)
+        except FileNotFoundError:
+            pass
+
+    def remove(self, file):
+        self.clean_attachment(file)
 
     @staticmethod
-    def clean_attachment(file_path):
-        os.remove(file_path)
+    def get_mb(data):
+        """
+        获取数据的Mb
+
+        :param int data: 字节数(待换算的内容长度)
+        :return: float
+        """
+        _kb = float(data / 1024.0)
+        return float(_kb / 1024.0)
 
     @staticmethod
-    def getsize(file_path: str):
-        def _getsize(filename):
+    def getsize(data):
+        """
+        计算数据大小
+
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        :return: str
+        """
+        size = 0
+        if isinstance(data, str):
             try:
-                return os.path.getsize(filename)
-            except:
-                return 0
+                size = os.path.getsize(data)
+            except FileNotFoundError:
+                pass
+        elif isinstance(data, bytes):
+            size = len(data)
+        else:
+            pass
 
-        _kb = float(_getsize(file_path)) / 1024
+        _kb = float(size) / 1024
+        result = "{:.1f} kb".format(_kb)
         if _kb >= 1024:
             _M = _kb / 1024
             if _M >= 1024:
                 _G = _M / 1024
-                return "{:.1f} G".format(_G)
+                result = "{:.1f} G".format(_G)
             else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
+                result = "{:.1f} M".format(_M)
+        return result
 
-    @staticmethod
-    def _fetch_attachment(
-            url: str,
-            file_path: str,
-            enable_proxy=False,
-            allow_show_exception=False,
-            **kwargs
-    ):
+    def fetch_data(self, url, file=None, **kwargs):
+        """
+        数据下载
+
+        :param str url: 下载地址
+        :param file: 本地文件
+        :param dict kwargs: requests请求参数
+        :return:
+        """
+        enable_proxy = kwargs.pop('enable_proxy', False)
+        allow_show_exception = kwargs.pop('allow_show_exception', False)
+        method = kwargs.pop('method', 'get')
         request_params = {}
+        request_params.setdefault('data', kwargs.pop('data', None))
+        request_params.setdefault('cookies', kwargs.pop('cookies', None))
         request_params.setdefault('headers', kwargs.get('headers') or headers)
         request_params.setdefault('proxies', kwargs.get('proxies'))
-        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
-        # request_params.setdefault('stream', kwargs.get('stream') or True)
-        request_params.setdefault('verify', kwargs.get('verify') or False)
-        if enable_proxy:
-            proxy = ProxyPool().get()
-        else:
-            proxy = {}
+        request_params.setdefault('timeout', kwargs.pop('timeout', 60))
+        request_params.setdefault('stream', kwargs.pop('stream', True))
+        request_params.setdefault('verify', kwargs.pop('verify', False))
+        request_params.setdefault('allow_redirects', kwargs.pop('allow_redirects', True))
+
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url,stream=True, **request_params) as req:
-                    content_size = req.headers.get('Content-Length') or 0
-                    content_size = int(content_size)
-                    stream = b''
-                    if req.status_code == 200:
-                        with open(file_path, 'wb') as f:
-                            with tqdm.tqdm(total=content_size, unit='B', initial=0, unit_scale=True, unit_divisor=1024,
-                                      ascii=True,desc=file_path) as bar:
-                                for chunk in req.iter_content(chunk_size=1024*20):
-                                    if chunk:
-                                        f.write(chunk)
-                                    stream += chunk
-                                    bar.update(len(chunk))
-                        return stream
-                    else:
+                with requests.request(method, url, **request_params) as req:
+                    stream = io.BytesIO()
+                    lh = {k.lower(): v for k, v in req.headers.items()}
+                    '''内容长度'''
+                    cl = lh.get('content-length') or len(req.content)
+                    icl = int(cl)
+                    content_length = self.get_mb(icl)
+                    if content_length > 50:
+                        '''丢弃超过50Mb内容长度的文件'''
+                        return stream.getvalue()
+
+                    if req.status_code != 200:
                         retries += 1
+                        continue
+
+                    iter_content = req.iter_content(chunk_size=1024 * 20)
+                    with tqdm.tqdm(
+                            total=icl,
+                            unit='B',
+                            initial=0,
+                            unit_scale=True,
+                            unit_divisor=1024,  # 1M=1024Kb,单位换算
+                            ascii=True,
+                            desc=file) as bar:
+                        if file is not None:
+                            with open(file, 'wb') as f:
+                                for chunk in iter_content:
+                                    stream.write(chunk)
+                                    size = f.write(chunk)
+                                    bar.update(size)
+                        else:
+                            for chunk in iter_content:
+                                size = stream.write(chunk)
+                                bar.update(size)
+                    return stream.getvalue()
             except requests.RequestException:
                 if allow_show_exception:
                     traceback.print_exc()
@@ -116,129 +173,115 @@ class AttachmentDownloader:
                 retries += 1
         return b''
 
-    def fetch_attachment(
-            self,
-            file_name: str,
-            file_type: str,
-            download_url: str,
-            enable_proxy=False,
-            allow_request_exception=False,
-            **kwargs
-    ):
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
-        file_path = self.create_file_path(file_name, file_type)
-        file_stream = self._fetch_attachment(
-            download_url,
-            file_path,
-            enable_proxy,
-            allow_request_exception,
-            **kwargs
-        )
-        # file_stream = self.download_file(download_url,file_path,enable_proxy,allow_request_exception)
-        if len(file_stream) > 0:
-            fid = self.create_fid(file_stream)
-            '''上传/下载,无论失败成功都需要给出文件基础信息'''
+    def _push_oss_from_stream(self, file_name, file_type, url, **kw):
+        """
+        将数据流推送oss
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str url: 下载地址
+        :param dict kw: 额外下载信息
+        :return: dict: 附件信息
+        """
+        stream = self.fetch_data(url, None, **kw)
+        if len(stream) > 0:
+            fid = self.create_fid(stream)
             try:
                 result = {
                     'filename': file_name,
                     'ftype': file_type,
                     'fid': "{}.{}".format(fid, file_type),
-                    'org_url': download_url,
-                    'size': self.getsize(file_path),
+                    'org_url': url,
+                    'size': self.getsize(stream),
                     'url': 'oss',
                 }
-                AliYunService().push_oss_from_local(result['fid'], file_path)
+                AliYunService().push_oss_from_stream(result['fid'], stream)
             except Exception:
                 result = {
                     'filename': file_name,
-                    'org_url': download_url,
+                    'org_url': url,
                 }
-            self.clean_attachment(file_path)
         else:
             result = {
                 'filename': file_name,
-                'org_url': download_url,
+                'org_url': url,
             }
         return result
 
-    def download_file(self, url, file_path, call_func=None,enable_proxy=False,data=None):
+    def _push_oss_from_file(self, file_name, file_type, url, **kw):
         """
-        Args:
-            url: 地址
-            file_path: 文件存储地址
-            call_func: 下载成功的回调
-        Returns:
+        将本地文件推送oss
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str url: 下载地址
+        :param dict kw: 额外下载信息
+        :return: dict: 附件信息
         """
-        # proxies = kwargs.get('proxies') or None
-        # data = kwargs.get('data') or None
-        start_time = time.time()
-        def progress_callfunc(blocknum, blocksize, totalsize):
-            """回调函数
-            @blocknum : 已经下载的数据块
-            @blocksize : 数据块的大小
-            @totalsize: 远程文件的大小
-            """
-            speed = (blocknum * blocksize) / (time.time() - start_time)
-            # speed_str = " Speed: %.2f" % speed
-            speed_str = " Speed: %s" % format_size(speed)
-            recv_size = blocknum * blocksize
-
-            # 设置下载进度条
-            f = sys.stdout
-            pervent = recv_size / totalsize
-            percent_str = "%.2f%%" % (pervent * 100)
-            n = round(pervent * 50)
-            s = ('#' * n).ljust(50, '-')
-            f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
-            f.flush()
-            f.write('\r')
-
-        def format_size(bytes):
+        file = self.create_file(file_name, file_type)
+        stream = self.fetch_data(url, file, **kw)
+        '''上传/下载,无论失败成功都需要返回文件基础信息'''
+        if len(stream) > 0:
+            fid = self.create_fid(stream)
             try:
-                bytes = float(bytes)
-                kb = bytes / 1024
-            except:
-                print("传入的字节格式不对")
-                return "Error"
-            if kb >= 1024:
-                M = kb / 1024
-                if M >= 1024:
-                    G = M / 1024
-                    return "%.3fG" % (G)
-                else:
-                    return "%.3fM" % (M)
-            else:
-                return "%.3fK" % (kb)
+                result = {
+                    'filename': file_name,
+                    'ftype': file_type,
+                    'fid': "{}.{}".format(fid, file_type),
+                    'org_url': url,
+                    'size': self.getsize(file),
+                    'url': 'oss',
+                }
+                AliYunService().push_oss_from_local(result['fid'], file)
+            except Exception:
+                result = {
+                    'filename': file_name,
+                    'org_url': url,
+                }
+        else:
+            result = {
+                'filename': file_name,
+                'org_url': url,
+            }
+        '''删除本地临时文件'''
+        self.remove(file)
+        return result
 
-        if url:
-            try:
-                if enable_proxy:
-                    proxies = ProxyPool().get()
-                    # create the object, assign it to a variable
-                    proxy = request.ProxyHandler(proxies)
-                    # construct a new opener using your proxy settings
-                    opener = request.build_opener(proxy)
-                    # install the openen on the module-level
-                    request.install_opener(opener)
-                # 测试可以打开进度条,生产环境禁用进度条
-                filename, headers = request.urlretrieve(url, file_path, progress_callfunc, data)
-                # filename, headers = request.urlretrieve(url, file_path, data)
-                print(filename,headers)
-
-                if callable(call_func):
-                    call_func()
-                return filename
-            except Exception as e:
-                print(e)
-                return ''
+    def _fetch_attachment(self, file_name, file_type, download_url, **kwargs):
+        """
+        下载附件
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str download_url: 下载地址
+        :param dict kwargs: 额外的附件下载配置
+        :return: dict: 附件
+        """
+        mode = kwargs.pop('mode', 'local')
+        if mode == "stream":
+            res = self._push_oss_from_stream(
+                file_name,
+                file_type,
+                download_url,
+                **kwargs
+            )
         else:
-            return ''
+            res = self._push_oss_from_file(
+                file_name,
+                file_type,
+                download_url,
+                **kwargs
+            )
+        return res
 
-if __name__ == '__main__':
+    def fetch_attachment(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            **kw
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
 
-    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
-    attachment = AttachmentDownloader().fetch_attachment(
-        file_name="file_name", file_type="pdf", download_url=url,
-        enable_proxy=False)
-    print(attachment)
+        return self._fetch_attachment(file_name, file_type, download_url, **kw)
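
A usage sketch for the reworked entry point, in place of the __main__ example removed above; the download URL is a placeholder. mode='stream' routes through _push_oss_from_stream, any other value keeps the local temp-file path.

from untils.attachment import AttachmentDownloader

attachment = AttachmentDownloader().fetch_attachment(
    file_name="招标文件",
    file_type="pdf",
    download_url="https://example.com/files/demo.pdf",  # placeholder
    mode="stream",  # "stream" -> _push_oss_from_stream; anything else -> _push_oss_from_file
)
print(attachment)  # always has filename/org_url; adds ftype/fid/size/url when the upload succeeds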

+ 0 - 61
FworkSpider/untils/chaojiying.py

@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# coding:utf-8
-
-import requests
-from hashlib import md5
-
-class Chaojiying_Client(object):
-
-    def __init__(self, username, password, soft_id):
-        self.username = username
-        password =  password.encode('utf8')
-        self.password = md5(password).hexdigest()
-        self.soft_id = soft_id
-        self.base_params = {
-            'user': self.username,
-            'pass2': self.password,
-            'softid': self.soft_id,
-        }
-        self.headers = {
-            'Connection': 'Keep-Alive',
-            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
-        }
-
-    def PostPic(self, im, codetype):
-        """
-        im: 图片字节
-        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
-        """
-        params = {
-            'codetype': codetype,
-        }
-        params.update(self.base_params)
-        files = {'userfile': ('ccc.jpg', im)}
-        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
-        return r.json()
-
-    def ReportError(self, im_id):
-        """
-        im_id:报错题目的图片ID
-        """
-        params = {
-            'id': im_id,
-        }
-        params.update(self.base_params)
-        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
-        return r.json()
-
-
-if __name__ == '__main__':
-    # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')	#用户中心>>软件ID 生成一个替换 96001
-    # im = open('a.jpg', 'rb').read()													#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
-    # # print(chaojiying.PostPic(im, 1902))
-    # res = chaojiying.PostPic(im, 2004)
-    # print(res)
-    # if res.get("err_no") != 0:
-    #     chaojiying.ReportError(res.get("pic_id"))
-    # if res.get("")
-    code = "haoho"
-    url = 'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode=胡吃海喝'[:-4]+code
-
-    print(url)

+ 0 - 0
FworkSpider/untils/clean_html/__init__.py


+ 0 - 131
FworkSpider/untils/clean_html/defaults.py

@@ -1,131 +0,0 @@
-import re
-
-__all__ = ['cleaner']
-
-# 独立元素
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
-    '\\xa0|\\u3000': '',  # 空格
-    '<!--[\s\S]*?-->': '',  # 注释
-    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # 输入框
-    '<img[^>]*>': '<br>',  # 图片
-}
-# 行内元素
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # 超链接
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-}
-# 块级元素
-BLOCK_TAGS = {
-    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
-    '<p>|<p [^>]*>|</p>': '<br>',  # 段落
-    '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
-}
-# 其他
-OTHER = {
-    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-}
-# 样式
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-}
-# 空白符
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# css标签集合
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# css属性集合
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写)"""
-    tags = re.findall("<[^>]+>", html)
-    for tag in tags:
-        html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    数据清洗
-
-    :param html: 清洗的页面
-    :param special: 额外指定页面清洗规则
-    :param completely: 是否完全清洗页面
-    :return: 清洗后的页面源码
-    """
-    if special is None:
-        special = {}
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html

+ 0 - 136
FworkSpider/untils/cleaner.py

@@ -1,136 +0,0 @@
-import re
-__all__ = ['cleaner']
-
-# 独立元素
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
-    '\\xa0|\\u3000': '',  # 空格
-    '<!--[\s\S]*?-->': '',  # 注释
-    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # 输入框
-    '<img[^>]*>': '<br>',  # 图片
-}
-# 行内元素
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # 超链接
-    '<link>|<link [^>]*>|</link>': '',  # 超链接
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-}
-# 块级元素
-BLOCK_TAGS = {
-    '<div>\s*?</div>':'',
-    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
-    '<p>|<p [^>]*>': '<br>',  # 段落
-    '</p>': '',  # 段落
-    '<div>|<div [^>]*>': '<br>',  # 分割 division
-    '</div>': '',  # 分割 division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
-}
-# 其他
-OTHER = {
-    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-
-}
-# 样式
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-
-}
-# 空白符
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# css标签集合
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# css属性集合
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写)"""
-    tags = re.findall("<[^>]+>", html)
-    for tag in tags:
-        html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    数据清洗
-
-    :param html: 清洗的页面
-    :param special: 额外指定页面清洗规则
-    :param completely: 是否完全清洗页面
-    :return: 清洗后的页面源码
-    """
-    if special is None:
-        special = {}
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html

+ 62 - 654
FworkSpider/untils/cookie_pool.py

@@ -1,227 +1,50 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/27 11:32 AM
----------
-@summary: cookie池
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import abc
-import datetime
-import random
 import time
-import warnings
 from collections import Iterable
-from enum import Enum, unique
-import requests
-from feapder.db.mongodb import MongoDB
 
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.network import user_agent
+from func_timeout import func_set_timeout
 
-from feapder.db.mysqldb import MysqlDB
+import feapder.utils.tools as tools
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
-from feapder.utils import metrics
+from feapder.network.cookie_pool import (
+    CookiePoolInterface,
+    PageCookiePool,
+    User,
+)
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
-from feapder.utils.tools import send_msg
-from feapder.utils.webdriver import WebDriver
-
+from feapder.utils.tools import get_current_date
 
-class CookiePoolInterface(metaclass=abc.ABCMeta):
-    """
-    cookie pool interface
-    """
-
-    @abc.abstractmethod
-    def create_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def get_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def del_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def run(self):
-        raise NotImplementedError
-
-
-class PageCookiePool(CookiePoolInterface):
-    """
-    由页面产生的cookie 不需要用户登陆
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        page_url=None,
-        min_cookies=10000,
-        must_contained_keys=(),
-        keep_alive=False,
-        **kwargs,
-    ):
-        """
-        @param redis_key: 项目名
-        @param page_url: 生产cookie的url
-        @param min_cookies: 最小cookie数
-        @param must_contained_keys: cookie 必须包含的key
-        @param keep_alive: 当cookie数量足够是是否保持随时待命,生产cookie的状态。False为否,满足则退出
-        ---
-        @param kwargs: WebDriver的一些参数
-            load_images: 是否加载图片
-            user_agent_pool: user-agent池 为None时不使用
-            proxies_pool: ;代理池 为None时不使用
-            headless: 是否启用无头模式
-            driver_type: web driver 类型
-            timeout: 请求超时时间 默认16s
-            window_size: 屏幕分辨率 (width, height)
-
-        """
-
-        self._redisdb = RedisDB()
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
-            redis_key
-        )  # 存储上一次统计cookie 数量的时间,格式为 时间戳:数量
-        self._page_url = page_url
-        self._min_cookies = min_cookies
-        self._must_contained_keys = must_contained_keys
-        self._keep_alive = keep_alive
-
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-
-    def create_cookie(self):
-        """
-        可能会重写
-        @return:
-        """
-        url = self._page_url
-        header = {
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": user_agent.get()
-        }
-        res = requests.get(url, headers=header)
-        cookies = requests.utils.dict_from_cookiejar(res.cookies)
-        return cookies
-
-
-    def add_cookies(self, cookies):
-        log.info("添加cookie {}".format(cookies))
-        self._redisdb.lpush(self._tab_cookie_pool, cookies)
-    def run(self):
-        while True:
-            try:
-                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
-                need_cookie_count = self._min_cookies - now_cookie_count
-
-                if need_cookie_count > 0:
-                    log.info(
-                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
-                            now_cookie_count, self._min_cookies
-                        )
-                    )
-                    try:
-                        print('????')
-                        cookies = self.create_cookie()
-                        if cookies:
-                            self.add_cookies(cookies)
-                    except Exception as e:
-                        log.exception(e)
-                else:
-                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-
-                    # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
-                    last_count_info = self._redisdb.strget(
-                        self._tab_cookie_pool_last_count
-                    )
-                    if not last_count_info:
-                        self._redisdb.strset(
-                            self._tab_cookie_pool_last_count,
-                            "{}:{}".format(time.time(), now_cookie_count),
-                        )
-                    else:
-                        last_time, last_count = last_count_info.split(":")
-                        last_time = float(last_time)
-                        last_count = int(last_count)
-
-                        if time.time() - last_time > 60:
-                            if now_cookie_count == last_count:
-                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
-                                break
-                            else:
-                                self._redisdb.strset(
-                                    self._tab_cookie_pool_last_count,
-                                    "{}:{}".format(time.time(), now_cookie_count),
-                                )
-
-                    if self._keep_alive:
-                        log.info("sleep 10")
-                        tools.delay_time(10)
-                    else:
-                        break
-
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def get_cookie(self, wait_when_null=True):
-        while True:
-            try:
-                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中...")
-                    self._keep_alive = False
-                    self._min_cookies = 1
-                    with RedisLock(
-                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
-                    ) as _lock:
-                        if _lock.locked:
-                            self.run()
-                    continue
-                return eval(cookie_info) if cookie_info else {}
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, cookies):
-        self._redisdb.lrem(self._tab_cookie_pool, cookies)
-
-# PageCookiePool('cookie_1',page_url="https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do").create_cookie()
-class User:
-    def __init__(self, username, cookie):
-        self.username = username
-        self.cookie = cookie
+__all__ = [
+    "PageCookiePool",
+    "User",
+    "LoginCookiePool"
+]
 
 
 class LoginCookiePool(CookiePoolInterface):
     """
-    需要登陆的cookie池, 用户账号密码等信息用mysql保存
+    需要登陆的cookie池, 用户账号密码等信息用mongoDB保存
     """
 
     def __init__(
-        self,
-        redis_key,
-        *,
-        table_userbase,
-        login_state_key="login_state",
-        lock_state_key="lock_state",
-        username_key="username",
-        password_key="password",
-        login_retry_times=10,
+            self,
+            redis_key,
+            *,
+            login_site,
+            table_userbase="feapder_login",
+            table_login_record="feapder_login_record",
+            login_state_key="login_state",
+            lock_state_key="lock_state",
+            username_key="username",
+            password_key="password",
+            login_retry_times=10,
     ):
         """
         @param redis_key: 项目名
+        @param login_site: 网站名称
         @param table_userbase: 用户表名
+        @param table_login_record: 用户登录状态表名
         @param login_state_key: 登录状态列名
         @param lock_state_key: 封锁状态列名
         @param username_key: 登陆名列名
@@ -232,15 +55,15 @@ class LoginCookiePool(CookiePoolInterface):
         self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
         self._login_retry_times = login_retry_times
         self._table_userbase = table_userbase
+        self._table_login_record = table_login_record
         self._login_state_key = login_state_key
         self._lock_state_key = lock_state_key
         self._username_key = username_key
         self._password_key = password_key
-
+        self._login_site = login_site
         self._redisdb = RedisDB()
         self._mongo = MongoDB(db='user_login')
 
-
     def create_cookie(self, username, password):
 
         """
@@ -257,7 +80,12 @@ class LoginCookiePool(CookiePoolInterface):
         @return: yield username, password
         """
 
-        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
+        query = {
+            "site": self._login_site,
+            self._lock_state_key: 0,
+            self._login_state_key: 0
+        }
+        return self._mongo.find(self._table_userbase, query)
 
     def handle_login_failed_user(self, username, password):
         """
@@ -279,14 +107,19 @@ class LoginCookiePool(CookiePoolInterface):
 
     def save_cookie(self, username, cookie):
         user_cookie = {"username": username, "cookie": cookie}
-
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
         self._mongo.add(
-                coll_name=self._table_userbase,
-                data={self._login_state_key:1},
-                update_columns=self._username_key,
-                update_columns_value=username)
+            coll_name=self._table_login_record,
+            data={self._login_state_key: 1,
+                  "status": "create",
+                  "site": self._login_site,
+                  "login_time": get_current_date()},
+            update_columns=self._username_key,
+            update_columns_value=username)
 
+    @func_set_timeout(60)
     def get_cookie(self, wait_when_null=True) -> User:
         while True:
             try:
@@ -315,22 +148,30 @@ class LoginCookiePool(CookiePoolInterface):
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
 
         self._mongo.add(
-            coll_name=self._table_userbase,
-            data={self._login_state_key: 1},
+            coll_name=self._table_login_record,
+            data={
+                self._login_state_key: 1,
+                "status": "remove",
+                "site": self._login_site,
+                "login_time": get_current_date()
+            },
             update_columns=self._username_key,
             update_columns_value=user.username)
 
     def user_is_locked(self, user: User):
-
         self._mongo.add(
-            coll_name=self._table_userbase,
-            data={self._lock_state_key: 1},
+            coll_name=self._table_login_record,
+            data={
+                self._lock_state_key: 1,
+                "site": self._login_site,
+                "login_time": get_current_date()
+            },
             update_columns=self._username_key,
             update_columns_value=user.username)
 
     def run(self):
         with RedisLock(
-            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
+                key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
         ) as _lock:
             if _lock.locked:
                 user_infos = self.get_user_info()
@@ -349,7 +190,8 @@ class LoginCookiePool(CookiePoolInterface):
                             if cookie:
                                 self.save_cookie(username, cookie)
                             else:
-                                self.handle_login_failed_user(username, password)
+                                self.handle_login_failed_user(username,
+                                                              password)
 
                             break
                         except Exception as e:
@@ -359,437 +201,3 @@ class LoginCookiePool(CookiePoolInterface):
                         self.handle_login_failed_user(username, password)
 
     login = run
-
-
-@unique
-class LimitTimesUserStatus(Enum):
-    # 使用状态
-    USED = "used"
-    SUCCESS = "success"
-    OVERDUE = "overdue"  # cookie 过期
-    SLEEP = "sleep"
-    EXCEPTION = "exception"
-    # 登陆状态
-    LOGIN_SUCCESS = "login_success"
-    LOGIN_FALIED = "login_failed"
-
-
-class LimitTimesUser:
-    """
-    有次数限制的账户
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
-    SITE_NAME = ""  # 网站名
-
-    redisdb = None
-
-    def __init__(
-        self,
-        username,
-        password,
-        max_search_times,
-        proxies=None,
-        search_interval=0,
-        **kwargs,
-    ):
-        """
-        @param username:
-        @param password:
-        @param max_search_times:
-        @param proxies:
-        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如(5,10)即5到10秒;或直接传整数
-        """
-        self.__dict__.update(kwargs)
-        self.username = username
-        self.password = password
-        self.max_search_times = max_search_times
-        self.proxies = proxies
-        self.search_interval = search_interval
-        self.delay_use = 0  # 延时使用,用于等待解封的用户
-
-        if isinstance(search_interval, (tuple, list)):
-            if len(search_interval) != 2:
-                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
-
-            self.used_for_time_length = (
-                search_interval[1] * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-        else:
-            self.used_for_time_length = (
-                search_interval * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-
-        self.account_info = {
-            "login_time": 0,
-            "cookies": {},
-            "search_times": 0,
-            "last_search_time": 0,
-            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
-            "init_search_times_time": 0,  # 初始化搜索次数的时间
-        }
-
-        if not self.__class__.redisdb:
-            self.__class__.redisdb = RedisDB()
-
-        self.sync_account_info_from_redis()
-
-        self.__init_metrics()
-
-    def __init_metrics(self):
-        """
-        初始化打点系统
-        @return:
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def record_user_status(self, status: LimitTimesUserStatus):
-        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
-
-    def __repr__(self):
-        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
-
-    def __eq__(self, other):
-        return self.username == other.username
-
-    def sync_account_info_from_redis(self):
-        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
-        if account_info:
-            account_info = eval(account_info)
-            self.account_info.update(account_info)
-
-    @property
-    def cookies(self):
-        cookies = self.account_info.get("cookies")
-        return cookies
-
-    def set_cookies(self, cookies):
-        self.account_info["cookies"] = cookies
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def set_login_time(self, login_time=None):
-        self.account_info["login_time"] = login_time or time.time()
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def get_login_time(self):
-        return self.account_info.get("login_time")
-
-    def is_time_to_login(self):
-        return time.time() - self.get_login_time() > 40 * 60
-
-    def get_last_search_time(self):
-        return self.account_info.get("last_search_time", 0)
-
-    def is_time_to_search(self):
-        if self.delay_use:
-            is_time = time.time() - self.get_last_search_time() > self.delay_use
-            if is_time:
-                self.delay_use = 0
-
-        else:
-            is_time = time.time() - self.get_last_search_time() > (
-                random.randint(*self.search_interval)
-                if isinstance(self.search_interval, (tuple, list))
-                else self.search_interval
-            )
-
-        return is_time
-
-    @property
-    def used_for_spider_name(self):
-        return self.account_info.get("used_for_spider_name")
-
-    @used_for_spider_name.setter
-    def used_for_spider_name(self, spider_name):
-        self.account_info["used_for_spider_name"] = spider_name
-
-    def update_status(self):
-        """
-        更新search的一些状态
-        @return:
-        """
-        self.account_info["search_times"] += 1
-        self.account_info["last_search_time"] = time.time()
-
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    @property
-    def search_times(self):
-        init_search_times_time = self.account_info.get("init_search_times_time")
-        current_time = time.time()
-        if (
-            current_time - init_search_times_time >= 86400
-        ):  # 如果距离上次初始化搜索次数时间大于1天,则搜索次数清清零
-            self.account_info["search_times"] = 0
-            self.account_info["init_search_times_time"] = current_time
-
-            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
-
-        return self.account_info["search_times"]
-
-    def is_overwork(self):
-        if self.search_times > self.max_search_times:
-            log.warning("账号 {} 请求次数超限制".format(self.username))
-            return True
-
-        return False
-
-    def is_at_work_time(self):
-        if datetime.datetime.now().hour in list(range(7, 23)):
-            return True
-
-        log.warning("账号 {} 不再工作时间内".format(self.username))
-        return False
-
-    def del_cookie(self):
-        self.account_info["cookies"] = {}
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def create_cookie(self):
-        """
-        生产cookie 有异常需要抛出
-        @return: cookie_dict
-        """
-
-        raise NotImplementedError
-
-    def login(self):
-        """
-        @return: 1 成功 0 失败
-        """
-
-        try:
-            # 预检查
-            if not self.is_time_to_login():
-                log.info("此账号尚未到登陆时间: {}".format(self.username))
-                time.sleep(5)
-                return 0
-
-            cookies = self.create_cookie()
-            if not cookies:
-                raise Exception("登陆失败 未获取到合法cookie")
-
-            if not isinstance(cookies, dict):
-                raise Exception("cookie 必须为字典格式")
-
-            # 保存cookie
-            self.set_login_time()
-            self.set_cookies(cookies)
-            log.info("登录成功 {}".format(self.username))
-            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
-            return 1
-
-        except Exception as e:
-            log.exception(e)
-            send_msg(
-                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
-                level="error",
-                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
-            )
-
-        log.info("登录失败 {}".format(self.username))
-        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
-        return 0
-
-
-class LimitTimesUserPool:
-    """
-    限制查询次数的用户的User pool
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    LOAD_USER_INTERVAL = 60
-
-    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
-        """
-        @param accounts_dic: 账户信息字典
-            {
-                "15011300228": {
-                    "password": "300228",
-                    "proxies": {},
-                    "max_search_times": 500,
-                    "search_interval": 1, # 使用时间间隔
-                    # 其他携带信息
-                }
-            }
-        @param limit_user_class: 用户重写的 limit_user_class
-        @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
-        """
-        self.accounts_dict = accounts_dict
-        self.limit_user_class = limit_user_class
-
-        self.limit_times_users = []
-        self.current_user_index = -1
-
-        self.support_more_client = support_more_client
-
-        self.last_load_user_time = 0
-
-    def __load_users(self, username=None):
-        # 装载user
-        log.info("更新可用用户")
-
-        for _username, detail in self.accounts_dict.items():
-            if username and username != _username:
-                continue
-
-            limit_times_users = self.limit_user_class(username=_username, **detail)
-            if limit_times_users in self.limit_times_users:
-                continue
-
-            if limit_times_users.is_overwork():
-                continue
-            else:
-                if (
-                    limit_times_users.cookies or limit_times_users.login()
-                ):  # 如果有cookie 或者登陆成功 则添加到可用的user队列
-                    self.limit_times_users.append(limit_times_users)
-
-        self.last_load_user_time = time.time()
-
-    def get_user(
-        self,
-        username=None,
-        used_for_spider_name=None,
-        wait_when_null=True,
-        not_limit_frequence=False,
-    ) -> LimitTimesUser:
-        """
-        @params username: 获取指定的用户
-        @params used_for_spider_name: 独享式使用,独享爬虫的名字。其他爬虫不可抢占
-        @params wait_when_null: 无用户时是否等待
-        @params not_limit_frequence: 不限制使用频率
-        @return: LimitTimesUser
-        """
-        if not self.support_more_client:
-            warnings.warn(
-                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
-                category=Warning,
-            )
-            self._is_show_warning = True
-
-        while True:
-            if (
-                not self.limit_times_users
-                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
-            ):
-                self.__load_users(username)
-                if not self.limit_times_users:
-                    log.warning("无可用的用户")
-                    if wait_when_null:
-                        time.sleep(1)
-                        continue
-                    else:
-                        return None
-
-            self.current_user_index += 1
-            self.current_user_index = self.current_user_index % len(
-                self.limit_times_users
-            )
-
-            limit_times_user = self.limit_times_users[self.current_user_index]
-            if self.support_more_client:  # 需要先同步下最新数据
-                limit_times_user.sync_account_info_from_redis()
-
-            if username and limit_times_user.username != username:
-                log.info(
-                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
-                )
-                time.sleep(1)
-                continue
-
-            # 独占式使用,若为其他爬虫,检查等待使用时间是否超过独占时间,若超过则可以使用
-            if (
-                limit_times_user.used_for_spider_name
-                and limit_times_user.used_for_spider_name != used_for_spider_name
-            ):
-                wait_time = time.time() - limit_times_user.get_last_search_time()
-                if wait_time < limit_times_user.used_for_time_length:
-                    log.info(
-                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
-                            limit_times_user.username,
-                            limit_times_user.used_for_spider_name,
-                            limit_times_user.used_for_time_length - wait_time,
-                        )
-                    )
-                    time.sleep(1)
-                    continue
-
-            if (
-                not limit_times_user.is_overwork()
-                and limit_times_user.is_at_work_time()
-            ):
-                if not limit_times_user.cookies:
-                    self.limit_times_users.remove(limit_times_user)
-                    continue
-
-                if not_limit_frequence or limit_times_user.is_time_to_search():
-                    limit_times_user.used_for_spider_name = used_for_spider_name
-
-                    limit_times_user.update_status()
-                    log.info("使用用户 {}".format(limit_times_user.username))
-                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
-                    return limit_times_user
-                else:
-                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
-                    time.sleep(1)
-                    continue
-            else:
-                self.limit_times_users.remove(limit_times_user)
-                self.current_user_index -= 1
-
-                if not limit_times_user.is_at_work_time():
-                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
-                    if wait_when_null:
-                        time.sleep(30)
-                        continue
-                    else:
-                        return None
-
-    def del_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.del_cookie()
-                self.limit_times_users.remove(limit_times_user)
-                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
-                self.__load_users(username)
-                break
-
-    def update_cookies(self, username, cookies):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.set_cookies(cookies)
-                break
-
-    def delay_use(self, username, delay_seconds):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.delay_use = delay_seconds
-                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
-                break
-
-    def record_success_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
-
-    def record_exception_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
-
-# if __name__ == '__main__':
-#     cookiepool = PageCookiePool(redis_key='fwork:gszfcg',
-#                                 page_url='http://www.ccgp-hubei.gov.cn/notice/cgyxgg/index_1.html',
-#                                 driver_type='FIREFOX',
-#                                 executable_path="D:\\geckodriver.exe")
-#     cookiepool.create_cookie()
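For reference, a sketch of the login-record document that the calls above upsert (the key names `username`, `login_state` and `lock_state` are placeholders for whatever `self._username_key`, `self._login_state_key` and `self._lock_state_key` are configured to; values are illustrative):

```python
# Hypothetical shape of a record written by user_is_locked() into self._table_login_record
locked_record = {
    "username": "user001",                 # matched via update_columns=self._username_key
    "lock_state": 1,                       # self._lock_state_key
    "site": "example-site",                # self._login_site
    "login_time": "2023-01-01 12:00:00",   # get_current_date()
}
```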

+ 0 - 33
FworkSpider/untils/create_menus.py

@@ -1,33 +0,0 @@
-from feapder.db.mongodb import MongoDB
-
-
-class Details:
-    _to_db = None
-    _to_db_xs = None
-    db_name = 'mgp_list'
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    @property
-    def to_db_xs(self):
-        if not self._to_db_xs:
-            self._to_db_xs = MongoDB(port=27001,db='editor')
-        return self._to_db_xs
-    def main(self,page):
-        menus_list = []
-        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
-        # print(data)
-        for item in data:
-            # print(item)
-            channls = item.get("param_common")[2]
-            code = item.get("code")
-            href = item.get("param_common")[11]
-            print("Menu"+"(",f"'{channls}',",f"'{code}',\n",f"'{href}',",page,"),")
-        #     menus_list.append(f'''Menu({channls},{code},{href},{page})''')
-        # print(menus_list)
-
-Details().main(2)

+ 11 - 15
FworkSpider/untils/execptions.py

@@ -1,19 +1,15 @@
+class PySpiderError(Exception):
 
-class CustomCheckError(Exception):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查失败'):
-        self.code = code
-        self.reason = reason
-
-
-class AttachmentNullError(Exception):
-
-    def __init__(self, code: int = 10004, reason: str = '附件下载失败'):
-        self.code = code
-        self.reason = reason
+    def __init__(self, *args, **kwargs):
+        if 'code' not in kwargs and 'reason' not in kwargs:
+            kwargs['code'] = 10000
+            kwargs['reason'] = '未知爬虫错误,请手动处理'
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(PySpiderError, self).__init__(*args, kwargs)
 
 
-class CustomAccountPrivilegeError(Exception):
+class AttachmentNullError(PySpiderError):
 
-    def __init__(self, *args, **kwargs):
-        pass
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常'):
+        super(AttachmentNullError, self).__init__(code=code, reason=reason)
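A minimal usage sketch of the reworked exception hierarchy (assuming the package is importable as `untils`; the raise site is hypothetical):

```python
from untils.execptions import AttachmentNullError, PySpiderError

try:
    raise AttachmentNullError()          # defaults: code=10004, reason='附件下载异常'
except PySpiderError as e:
    # code/reason are attached dynamically in PySpiderError.__init__
    print(e.code, e.reason)              # -> 10004 附件下载异常
```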

+ 129 - 12
FworkSpider/untils/get_imgcode.py

@@ -1,21 +1,138 @@
 import requests
-from typing import Mapping
 
+__all__ = [
+    "swordfish_platform",
+    "chaojiying_platform",
+    "chaojiying_report",
+    "get_code",
+    "get_code_det",
+    "arithmetic_captcha",
+]
 
-def get_code(file_path: str) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify"
-    with open(file_path, 'rb') as f:
+headers = {"accept": "application/json"}
+
+
+def _pack_file(file):
+    """包装验证码格式"""
+    if isinstance(file, str) and file.startswith("data:image"):
+        img_file = {"file": file}
+    elif isinstance(file, bytes):
+        img_file = {"file": file}
+    else:
+        with open(file, "rb") as f:
+            img_bytes = f.read()
+        img_file = {"file": img_bytes}
+    return img_file
+
+
+def _simple_captcha(file):
+    """
+    普通验证码
+
+    @param file: 验证码 - 可以是图片或者图片base64编码
+    @return:
+    """
+    url = "http://123.57.163.80:2119/v1/images/verify"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    rp_json = r.json()
+    if "msg" in rp_json and "success" == rp_json["msg"]:
+        return str(rp_json["r"]["code"]).upper()
+    return None
+
+
+def _arithmetic_captcha(file):
+    """算术验证码"""
+    url = "http://123.57.163.80:2119/v1/images/arithmetic"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def _get_click_verify_captcha(file):
+    """点触式验证码"""
+    url = "http://123.57.163.80:2119/v1/images/verify_det"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    return r.json()
+
+
+def swordfish_platform(file, mode="simple"):
+    """剑鱼验证码识别平台"""
+    if mode.lower() == "arithmetic":
+        return _arithmetic_captcha(file)
+    elif mode.lower() == "det":
+        return _get_click_verify_captcha(file)
+    else:
+        return _simple_captcha(file)
+
+
+def chaojiying_platform(file, pic_type: int):
+    """
+    超级鹰识别平台
+
+    pic_type,详情查询地址: https://www.chaojiying.com/price.html
+    """
+    with open(file, 'rb') as f:
         image_bytes = f.read()
-    content = {'file': image_bytes}
-    # json_resp = get_verify_code(upload_address, content)
+    files = {'file': image_bytes}
+
+    url = f"http://123.57.163.80:2119/v1/images/discern?pic_type={pic_type}"
     headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=10)
+    json_resp = response.json()
+    # code == 0 means the platform recognized the captcha successfully
+    if json_resp.get("code") == 0:
+        pic_str = json_resp["r"]["pic_str"]
+        pic_id = json_resp["r"]["pic_id"]  # keep pic_id for chaojiying_report() if the result turns out wrong
+        return pic_str
+
+
+def chaojiying_report(pic_id: str):
+    """超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id"""
+    url = f"http://123.57.163.80:2119/v1/images/report_err?pic_id={pic_id}"
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    # A successful report returns: {'msg': 'OK', 'code': 0}
+    # Do not call this endpoint casually: only report back after a recognition
+    # result has actually been confirmed wrong; if that cannot be determined, skip the call.
     return response.json()
 
+
+def get_code(file_path: str) -> dict:
+    return swordfish_platform(file_path) or {}
+
+
 def get_code_det(image_bytes) -> dict:
-   upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
-   content = {'image_content': image_bytes}
-   headers = {'accept': 'application/json'}
-   response = requests.post(upload_address, headers=headers, files=content, stream=True)
-   return response.json()
+    return swordfish_platform(image_bytes, mode="det")
+
 
+# Arithmetic captcha
+def arithmetic_captcha(image_stream):
+    return swordfish_platform(image_stream, mode="arithmetic")
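Taken together, the module now exposes one helper per platform. A hedged usage sketch (file names and the pic_type code are illustrative):

```python
from untils.get_imgcode import swordfish_platform, chaojiying_platform

code = swordfish_platform("captcha.png")                       # plain captcha -> e.g. "AB12" or None
answer = swordfish_platform("math.png", mode="arithmetic")     # arithmetic captcha -> e.g. "8" or None
text = chaojiying_platform("hard_captcha.png", pic_type=1902)  # type code per the chaojiying price list
# chaojiying_report(pic_id) is only called after a result is confirmed wrong;
# pic_id comes from the platform response.
```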

+ 2 - 762
FworkSpider/untils/proxy_pool.py

@@ -1,763 +1,3 @@
-# coding:utf8
-"""
-代理池
-"""
-import datetime
-import json
-import os
-import random
-import socket
-import time
-from urllib import parse
+from feapder.network.proxy_pool import ProxyPool
 
-import redis
-import requests
-
-from feapder import setting
-from feapder.utils import tools
-from feapder.utils.log import log
-
-
-def decrypt(input_str: str) -> str:
-    """
-    改写:新增
-    定义base64解密函数
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # 对前面不是“=”的字节取索引,然后转换为2进制
-    # 补齐“=”的个数
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # 转换成2进制字符串
-        temp_str = ''.join(temp_list)
-        # 对没有8位2进制的字符串补够8位2进制
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # 4个6字节的二进制  转换  为三个8字节的二进制
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # 二进制转为10进制
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # 连接成字符串
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-
-# 建立本地缓存代理文件夹
-proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
-if not os.path.exists(proxy_path):
-    os.mkdir(proxy_path)
-
-
-# def get_proxies_by_host(host, port):
-#     proxy_id = "{}:{}".format(host, port)
-#     return get_proxies_by_id(proxy_id)
-
-
-# def get_proxies_by_id(proxy_id):
-#     proxies = {
-#         "http": "http://{}".format(proxy_id),
-#         "https": "https://{}".format(proxy_id),
-#     }
-#     return proxies
-
-
-def get_proxy_from_url(**kwargs):
-    """
-    获取指定url的代理
-    :param kwargs:
-    :return:
-    """
-    proxy_source_url = kwargs.get("proxy_source_url", [])
-    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
-
-    if not isinstance(proxy_source_url, list):
-        proxy_source_url = [proxy_source_url]
-        proxy_source_url = [x for x in proxy_source_url if x]
-    if not proxy_source_url:
-        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
-    kwargs = kwargs.copy()
-    kwargs.pop("proxy_source_url")
-    proxies_list = []
-    for url in proxy_source_url:
-        if url.startswith("http"):
-            proxies_list.extend(get_proxy_from_http(url, **kwargs))
-        elif url.startswith("redis"):
-            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
-
-    if proxies_list:
-        # 顺序打乱
-        random.shuffle(proxies_list)
-    return proxies_list
-
-
-def get_proxy_from_http(proxy_source_url, **kwargs):
-    """
-    从指定 http 地址获取代理
-    :param proxy_source_url:
-    :param kwargs:
-    :return:
-    """
-    filename = tools.get_md5(proxy_source_url) + ".txt"
-    abs_filename = os.path.join(proxy_path, filename)
-    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
-    update_flag = 0
-    if not update_interval:
-        # 强制更新
-        update_flag = 1
-    elif not os.path.exists(abs_filename):
-        # 文件不存在则更新
-        update_flag = 1
-    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
-        # 超过更新间隔
-        update_flag = 1
-    if update_flag:
-        pool = []
-        response = requests.get(proxy_source_url, timeout=20)
-        # 改写:获取scocks代理的response处理
-        for proxy in response.json():
-            host = decrypt(proxy['host'])
-            port = proxy['port']
-            endTime = proxy['EndTime']
-            pool.append(f"{host}:{port}&&{endTime}")
-
-        with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write('\n'.join(pool))
-    return get_proxy_from_file(filename)
-
-
-def get_proxy_from_file(filename, **kwargs):
-    """
-    从指定本地文件获取代理
-        文件格式
-        ip:port:https
-        ip:port:http
-        ip:port
-    :param filename:
-    :param kwargs:
-    :return:
-    """
-    proxies_list = []
-    with open(os.path.join(proxy_path, filename), "r") as f:
-        lines = f.readlines()
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # 解析
-        auth = ""
-        if "@" in line:
-            auth, line = line.split("@")
-        # 改写,解析代理有效期结束时间
-        line, end = line.split("&&")
-
-        items = line.split(":")
-        if len(items) < 2:
-            continue
-
-        ip, port, *protocol = items
-        if not all([port, ip]):
-            continue
-        if auth:
-            ip = "{}@{}".format(auth, ip)
-        if not protocol:
-            # 改写:判断代理是否在有效期内,并将代理格式重http格式改成socks格式
-            if time.time() < int(end):
-                proxies = {
-                    "https": "socks5://%s:%s" % (ip, port),
-                    "http": "socks5://%s:%s" % (ip, port),
-                    # "end":end
-                }
-            else:
-                continue
-        else:
-            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
-        proxies_list.append(proxies)
-
-    return proxies_list
-
-
-def get_proxy_from_redis(proxy_source_url, **kwargs):
-    """
-    从指定 redis 地址获取代理
-    @param proxy_source_url: redis://:passwd@host:ip/db
-        redis 存储结构 zset
-        ip:port ts
-    @param kwargs:
-        {"redis_proxies_key": "xxx"}
-    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
-    """
-
-    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
-    key = kwargs.get("redis_proxies_key")
-    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
-    proxies = redis_conn.zrange(key, 0, -1)
-    proxies_list = []
-    for proxy in proxies:
-        proxy = proxy.decode()
-        proxies_list.append(
-            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
-        )
-    return proxies_list
-
-
-def check_proxy(
-        ip="",
-        port="",
-        proxies=None,
-        type=0,
-        timeout=5,
-        logger=None,
-        show_error_log=True,
-        **kwargs,
-):
-    """
-    代理有效性检查
-    :param ip:
-    :param port:
-    :param type: 0:socket  1:requests
-    :param timeout:
-    :param logger:
-    :return:
-    """
-    if not logger:
-        logger = log
-    ok = 0
-    if type == 0 and ip and port:
-        # socket检测成功 不代表代理一定可用 Connection closed by foreign host. 这种情况就不行
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(timeout)
-            try:
-                # 必须检测 否则代理永远不刷新
-                sk.connect((ip, int(port)))
-                ok = 1
-            except Exception as e:
-                if show_error_log:
-                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
-            sk.close()
-    else:
-        if not proxies:
-            proxies = {
-                "http": "socks5://{}:{}".format(ip, port),
-                "https": "socks5//{}:{}".format(ip, port),
-            }
-        try:
-            # 改写:代理检测的url
-            r = requests.get(
-                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
-            )
-            ok = 1
-            r.close()
-        except Exception as e:
-            if show_error_log:
-                logger.debug(
-                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
-                )
-    return ok
-
-
-class ProxyItem(object):
-    """单个代理对象"""
-
-    # 代理标记
-    proxy_tag_list = (-1, 0, 1)
-
-    def __init__(
-            self,
-            proxies=None,
-            valid_timeout=20,
-            check_interval=180,
-            max_proxy_use_num=10000,
-            delay=30,
-            use_interval=None,
-            **kwargs,
-    ):
-        """
-        :param proxies:
-        :param valid_timeout:  代理检测超时时间 默认-1    20181008  默认不再监测有效性
-        :param check_interval:
-        :param max_proxy_use_num:
-        :param delay:
-        :param use_interval: 使用间隔 单位秒 默认不限制
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs:
-        """
-        # {"http": ..., "https": ...}
-        self.proxies = proxies
-        # 检测超时时间 秒
-        self.valid_timeout = valid_timeout
-        # 检测间隔 秒
-        self.check_interval = check_interval
-
-        # 标记  0:正常 -1:丢弃  1: 待会再用 ...
-        self.flag = 0
-        # 上次状态变化时间
-        self.flag_ts = 0
-        # 上次更新时间 有效时间
-        self.update_ts = 0
-        # 最大被使用次数
-        self.max_proxy_use_num = max_proxy_use_num
-        # 被使用次数记录
-        self.use_num = 0
-        # 延迟使用时间
-        self.delay = delay
-        # 使用间隔 单位秒
-        self.use_interval = use_interval
-        # 使用时间
-        self.use_ts = 0
-
-        self.proxy_args = self.parse_proxies(self.proxies)
-        self.proxy_ip = self.proxy_args["ip"]
-        self.proxy_port = self.proxy_args["port"]
-        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
-        if self.proxy_args["user"]:
-            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
-        else:
-            self.proxy_id = self.proxy_ip_port
-
-        # 日志处理器
-        self.logger = log
-
-    def get_proxies(self):
-        self.use_num += 1
-        return self.proxies
-
-    def is_delay(self):
-        return self.flag == 1
-
-    def is_valid(self, force=0, type=0):
-        """
-        检测代理是否有效
-            1 有效
-            2 延时使用
-            0 无效 直接在代理池删除
-        :param force:
-        :param type:
-        :return:
-        """
-        if self.use_num > self.max_proxy_use_num > 0:
-            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
-            return 0
-        if self.flag == -1:
-            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
-            return 0
-        if self.delay > 0 and self.flag == 1:
-            if time.time() - self.flag_ts < self.delay:
-                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
-                return 2
-            else:
-                self.flag = 0
-                self.logger.debug("延迟代理释放: {}".format(self.proxies))
-        if self.use_interval:
-            if time.time() - self.use_ts < self.use_interval:
-                return 2
-        if not force:
-            if time.time() - self.update_ts < self.check_interval:
-                return 1
-        if self.valid_timeout > 0:
-            ok = check_proxy(
-                proxies=self.proxies,
-                type=type,
-                timeout=self.valid_timeout,
-                logger=self.logger,
-            )
-        else:
-            ok = 1
-        self.update_ts = time.time()
-        return ok
-
-    @classmethod
-    def parse_proxies(self, proxies):
-        """
-        分解代理组成部分
-        :param proxies:
-        :return:
-        """
-        if not proxies:
-            return {}
-        if isinstance(proxies, (str, bytes)):
-            proxies = json.loads(proxies)
-        protocol = list(proxies.keys())
-        if not protocol:
-            return {}
-        _url = proxies.get(protocol[0])
-        # 改写:注释http代理url的拼接,以正常生成代理池
-        # if not _url.startswith("http"):
-        #     _url = "http://" + _url
-        _url_parse = parse.urlparse(_url)
-        netloc = _url_parse.netloc
-        if "@" in netloc:
-            netloc_auth, netloc_host = netloc.split("@")
-        else:
-            netloc_auth, netloc_host = "", netloc
-        ip, *port = netloc_host.split(":")
-        port = port[0] if port else "80"
-        user, *password = netloc_auth.split(":")
-        password = password[0] if password else ""
-        return {
-            "protocol": protocol,
-            "ip": ip,
-            "port": port,
-            "user": user,
-            "password": password,
-            "ip_port": "{}:{}".format(ip, port),
-        }
-
-
-class ProxyPoolBase(object):
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def get(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class ProxyPool(ProxyPoolBase):
-    """代理池"""
-
-    def __init__(self, **kwargs):
-        """
-        :param size: 代理池大小  -1 为不限制
-        :param proxy_source_url: 代理文件地址 支持列表
-        :param proxy_instance:  提供代理的实例
-        :param reset_interval:  代理池重置间隔 最小间隔
-        :param reset_interval_max:  代理池重置间隔 最大间隔 默认2分钟
-        :param check_valid: 是否在获取代理时进行检测有效性
-        :param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs: 其他的参数
-        """
-        kwargs.setdefault("size", -1)
-        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
-
-        super(ProxyPool, self).__init__(**kwargs)
-        # 队列最大长度
-        self.max_queue_size = kwargs.get("size", -1)
-        # 实际代理数量
-        self.real_max_proxy_count = 1000
-        # 代理可用最大次数
-        # 代理获取地址 http://localhost/proxy.txt
-        self.proxy_source_url = kwargs.get("proxy_source_url", [])
-        if not isinstance(self.proxy_source_url, list):
-            self.proxy_source_url = [self.proxy_source_url]
-            self.proxy_source_url = [x for x in self.proxy_source_url if x]
-            self.proxy_source_url = list(set(self.proxy_source_url))
-            kwargs.update({"proxy_source_url": self.proxy_source_url})
-        # 处理日志
-        self.logger = kwargs.get("logger") or log
-        kwargs["logger"] = self.logger
-        if not self.proxy_source_url:
-            self.logger.warn("need set proxy_source_url or proxy_instance")
-
-        # 代理池重置间隔
-        self.reset_interval = kwargs.get("reset_interval", 5)
-        # 强制重置一下代理 添加新的代理进来 防止一直使用旧的被封的代理
-        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
-        # 是否监测代理有效性
-        self.check_valid = kwargs.get("check_valid", True)
-
-        # 代理队列
-        self.proxy_queue = None
-        # {代理id: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 失效代理队列
-        self.invalid_proxy_dict = {}
-
-        self.kwargs = kwargs
-
-        # 重置代理池锁
-        self.reset_lock = None
-        # 重置时间
-        self.last_reset_time = 0
-        # 重置的太快了  计数
-        self.reset_fast_count = 0
-        # 计数 获取代理重试3次仍然失败 次数
-        self.no_valid_proxy_times = 0
-
-        # 上次获取代理时间
-        self.last_get_ts = time.time()
-
-        # 记录ProxyItem的update_ts 防止由于重置太快导致重复检测有效性
-        self.proxy_item_update_ts_dict = {}
-
-        # 警告
-        self.warn_flag = False
-
-    def warn(self):
-        if not self.warn_flag:
-            for url in self.proxy_source_url:
-                if "zhima" in url:
-                    continue
-            self.warn_flag = True
-        return
-
-    @property
-    def queue_size(self):
-        """
-        当前代理池中代理数量
-        :return:
-        """
-        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
-
-    def clear(self):
-        """
-        清空自己
-        :return:
-        """
-        self.proxy_queue = None
-        # {代理ip: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 清理失效代理集合
-        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
-        self.invalid_proxy_dict = {
-            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
-        }
-        # 清理超时的update_ts记录
-        _limit = time.time() - 600
-        self.proxy_item_update_ts_dict = {
-            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
-        }
-        return
-
-    def get(self, retry: int = 0) -> dict:
-        """
-        从代理池中获取代理
-        :param retry:
-        :return:
-        """
-        retry += 1
-        if retry > 3:
-            self.no_valid_proxy_times += 1
-            return None
-        # if time.time() - self.last_get_ts > 3 * 60:
-        #     # 3分钟没有获取过 重置一下
-        #     try:
-        #         self.reset_proxy_pool()
-        #     except Exception as e:
-        #         self.logger.exception(e)
-        # 记录获取时间
-        self.last_get_ts = time.time()
-        #
-        self.warn()
-        proxy_item = self.get_random_proxy()
-        if proxy_item:
-            # 不检测
-            if not self.check_valid:  #
-                # 塞回去
-                proxies = proxy_item.get_proxies()
-                self.put_proxy_item(proxy_item)
-                return proxies
-            else:
-                is_valid = proxy_item.is_valid()
-                if is_valid:
-                    # 记录update_ts
-                    self.proxy_item_update_ts_dict[
-                        proxy_item.proxy_id
-                    ] = proxy_item.update_ts
-                    # 塞回去
-                    proxies = proxy_item.get_proxies()
-                    self.put_proxy_item(proxy_item)
-                    if is_valid == 1:
-                        if proxy_item.use_interval:
-                            proxy_item.use_ts = time.time()
-                        return proxies
-                else:
-                    # 处理失效代理
-                    self.proxy_dict.pop(proxy_item.proxy_id, "")
-                    self.invalid_proxy_dict[
-                        proxy_item.proxy_id
-                    ] = datetime.datetime.now()
-        else:
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        if self.no_valid_proxy_times >= 5:
-            # 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多(时间越长越多) 可能出现一直获取不到代理的情况
-            # 导致爬虫烂尾
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        return self.get(retry)
-
-    get_proxy = get
-
-    def get_random_proxy(self) -> ProxyItem:
-        """
-        随机获取代理
-        :return:
-        """
-        if self.proxy_queue is not None:
-            if random.random() < 0.5:
-                # 一半概率检查 这是个高频操作 优化一下
-                if time.time() - self.last_reset_time > self.reset_interval_max:
-                    time.sleep(3)
-                    self.reset_proxy_pool(force=True)
-                else:
-                    min_q_size = (
-                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
-                        if self.max_queue_size > 0
-                        else self.real_max_proxy_count / 2
-                    )
-                    if self.proxy_queue.qsize() < min_q_size:
-                        time.sleep(3)
-                        self.reset_proxy_pool()
-            try:
-                return self.proxy_queue.get_nowait()
-            except Exception:
-                pass
-        return None
-
-    def append_proxies(self, proxies_list: list) -> int:
-        """
-        添加代理到代理池
-        :param proxies_list:
-        :return:
-        """
-        count = 0
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if proxies:
-                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
-                # 增加失效判断 2018/12/18
-                if proxy_item.proxy_id in self.invalid_proxy_dict:
-                    continue
-                if proxy_item.proxy_id not in self.proxy_dict:
-                    # 补充update_ts
-                    if not proxy_item.update_ts:
-                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
-                            proxy_item.proxy_id, 0
-                        )
-                    self.put_proxy_item(proxy_item)
-                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
-                    count += 1
-        return count
-
-    def put_proxy_item(self, proxy_item: ProxyItem):
-        """
-        添加 ProxyItem 到代理池
-        :param proxy_item:
-        :return:
-        """
-        return self.proxy_queue.put_nowait(proxy_item)
-
-    def reset_proxy_pool(self, force: bool = False):
-        """
-        重置代理池
-        :param force: 是否强制重置代理池
-        :return:
-        """
-        if not self.reset_lock:
-            # 必须用时调用 否则 可能存在 gevent patch前 threading就已经被导入 导致的Rlock patch失效
-            import threading
-
-            self.reset_lock = threading.RLock()
-        with self.reset_lock:
-            if (
-                    force
-                    or self.proxy_queue is None
-                    or (
-                    self.max_queue_size > 0
-                    and self.proxy_queue.qsize() < self.max_queue_size / 2
-            )
-                    or (
-                    self.max_queue_size < 0
-                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
-            )
-                    or self.no_valid_proxy_times >= 5
-            ):
-                if time.time() - self.last_reset_time < self.reset_interval:
-                    self.reset_fast_count += 1
-                    if self.reset_fast_count % 10 == 0:
-                        self.logger.debug(
-                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
-                        )
-                        time.sleep(1)
-                else:
-                    self.clear()
-                    if self.proxy_queue is None:
-                        import queue
-
-                        self.proxy_queue = queue.Queue()
-                    # TODO 这里获取到的可能重复
-                    proxies_list = get_proxy_from_url(**self.kwargs)
-                    self.real_max_proxy_count = len(proxies_list)
-                    if 0 < self.max_queue_size < self.real_max_proxy_count:
-                        proxies_list = random.sample(proxies_list, self.max_queue_size)
-                    _valid_count = self.append_proxies(proxies_list)
-                    self.last_reset_time = time.time()
-                    self.no_valid_proxy_times = 0
-                    self.logger.debug(
-                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
-                            len(proxies_list),
-                            _valid_count,
-                            len(self.invalid_proxy_dict),
-                            len(self.proxy_dict),
-                        )
-                    )
-        return
-
-    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
-        """
-        对代理进行标记
-        :param proxies_list:
-        :param flag:
-                    -1  废弃
-                    1 延迟使用
-        :param delay: 延迟时间
-        :return:
-        """
-        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
-            return False
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if not proxies:
-                continue
-            proxy_id = ProxyItem(proxies).proxy_id
-            if proxy_id not in self.proxy_dict:
-                continue
-            self.proxy_dict[proxy_id].flag = flag
-            self.proxy_dict[proxy_id].flag_ts = time.time()
-            self.proxy_dict[proxy_id].delay = delay
-
-        return True
-
-    def get_proxy_item(self, proxy_id="", proxies=None):
-        """
-        获取代理对象
-        :param proxy_id:
-        :param proxies:
-        :return:
-        """
-        if proxy_id:
-            return self.proxy_dict.get(proxy_id)
-        if proxies:
-            proxy_id = ProxyItem(proxies).proxy_id
-            return self.proxy_dict.get(proxy_id)
-        return
-
-    def copy(self):
-        return ProxyPool(**self.kwargs)
-
-    def all(self) -> list:
-        """
-        获取当前代理池中的全部代理
-        :return:
-        """
-        return get_proxy_from_url(**self.kwargs)
-
-
-if __name__ == '__main__':
-    ProxyPool().get()
+__all__ = ["ProxyPool"]

+ 186 - 123
FworkSpider/untils/tools.py

@@ -1,24 +1,93 @@
+import copy
+import functools
 import hashlib
-import json
 import re
 from collections import namedtuple
-import requests
-from setting import WECHAT_WARNING_URL,WECHAT_WARNING_PHONE,WARNING_INTERVAL,WECHAT_WARNING_ALL
+from string import whitespace
+
 import bson
-from feapder.utils.log import log
-from feapder.db.mongodb import MongoDB
-from .cleaner import cleaner
-import sys
+import requests
+
+from untils.clean_html import cleaner
 
 SearchText = namedtuple('SearchText', ['total'])
 
 
-def substitute(html_str,special=None, completely=False):
+def substitute(html_str, special=None, completely=False):
     """HTML 替换"""
-    html_str = cleaner(html=html_str,special=None, completely=False)
+    html_str = cleaner(html=html_str, special=special, completely=completely)
     return html_str
 
 
+def merge_files(*files):
+    """合并文件"""
+    res = {}
+    for file_ in files:
+        if isinstance(file_, dict):
+            for _, attachment in file_.items():
+                res[str(len(res) + 1)] = attachment
+    return res
+
+
+def is_all_chinese(strs):
+    """检验是否全是中文字符"""
+    for _char in strs:
+        if not '\u4e00' <= _char <= '\u9fa5':
+            return False
+    return True
+
+
+def clean_document(*fields):
+    """
+    清洗mongo文档
+
+    :param fields: 清洗字段
+
+    # 用例:
+    # >>> clean_document('dzr')(lambda *args, **kw: None)(document)
+    """
+
+    def clean(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            defaults = {
+                "_id",
+                "parser_name", "parser_url", "request_params",
+                "failed", "error"
+            }
+            removes = defaults if not fields else {*defaults, *fields}
+            item = args[0] if not kwargs else kwargs
+            data_dict = item if isinstance(item, dict) else item.to_dict
+            copy_data_dict = copy.deepcopy(data_dict)
+            for k in copy_data_dict.keys():
+                if k in removes:
+                    del data_dict[k]
+                    try:
+                        delattr(item, k)  # also drop the attribute from the Item instance
+                    except AttributeError:
+                        pass
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return clean
+
+
+def clean_chars(text, charsets=whitespace):
+    """
+    按照字符集,删除字符
+
+    :param str text: 文本
+    :param charsets: 字符集
+    :return: 干净的文本
+    """
+    if text is not None:
+        for char in charsets:
+            if char in text:
+                text = text.replace(char, '')
+    return text
+
+
 def get_signature(content: str) -> str:
     """
     十六进制数字字符串形式摘要值
@@ -31,6 +100,15 @@ def get_signature(content: str) -> str:
     return sha1.hexdigest()
 
 
+def get_md5(val):
+    md5 = hashlib.md5()
+    if isinstance(val, bytes):
+        md5.update(str(val).encode("utf-8"))
+    elif isinstance(val, str):
+        md5.update(val.encode("utf-8"))
+    return md5.hexdigest()
+
+
 def text_search(content: str) -> SearchText:
     """
     中文检索
@@ -50,115 +128,93 @@ def int2long(param: int):
     """int 转换成 long """
     return bson.int64.Int64(param)
 
-def get_spiders(menus):
-    db = MongoDB(db="editor")
-    for menu in menus:
-        spider_info = db.find('luaconfig',{"code":menu.code})
-        if len(spider_info) >0:
-            if spider_info[0].get("state") not in (11,):
-                menus.remove(menu)
-
-def wechat_warning(
-    message,
-    message_prefix=None,
-    rate_limit=None,
-    url=None,
-    user_phone=None,
-    all_users: bool = None,
-):
-    """企业微信报警"""
-
-    # 为了加载最新的配置
-    rate_limit = rate_limit if rate_limit is not None else WARNING_INTERVAL
-    url = url or WECHAT_WARNING_URL
-    user_phone = user_phone or WECHAT_WARNING_PHONE
-    all_users = all_users if all_users is not None else WECHAT_WARNING_ALL
-
-    if isinstance(user_phone, str):
-        user_phone = [user_phone] if user_phone else []
-
-    if all_users is True or not user_phone:
-        user_phone = ["@all"]
-
-    if not all([url, message]):
-        return
-
-    data = {
-        "msgtype": "text",
-        "text": {"content": message, "mentioned_mobile_list": user_phone},
-    }
 
-    headers = {"Content-Type": "application/json"}
+def njpc_hpsj_filt_keywords(text: str, special_kw=None):
+    if special_kw is None:
+        special_kw = {}
+
+    keywords = {'项目', '工程', '验收', '评价', *special_kw}
 
-    try:
-        response = requests.post(
-            url, headers=headers, data=json.dumps(data).encode("utf8")
-        )
-        result = response.json()
-        response.close()
-        if result.get("errcode") == 0:
-            return True
+    for keyword in keywords:
+        result = re.match(f'.*{keyword}', text, re.S)
+        if result is not None:
+            return True  # keep - the text matches a target keyword
+    else:
+        return False     # discard
+
+
+# Regex-based field extraction for proposed-construction (拟建) spiders
+def njpc_fields_extract(html, data_item, is_clean=False):
+    """
+    Extract proposed-construction fields from a detail page with regular expressions
+
+    :param str html: page source
+    :param Items data_item: detail-page item
+    :param bool is_clean: whether to clean the source with substitute() first
+    :return: the populated data_item
+    """
+    if is_clean:
+        html = substitute(html)
+
+    data_item.title = data_item.projectname
+    projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvecode = re.findall('项目代码(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    owner = re.findall('建设(?:单位|单位名称)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_phone = re.findall('联系(?:电话|方式)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvedept = re.findall('审批部门(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvenumber = re.findall('(?:审批|批准)文号(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvetime = re.findall('审批时间(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S))
+    project_completedate = re.findall('竣工日期(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+
+    if project_scale:
+        construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        if not construction_area:
+            construction_area = ""
         else:
-            raise Exception(result.get("errmsg"))
-    except Exception as e:
-        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
-        return False
-
-class JyBasicException(Exception):
-
-    def __init__(self, code: int, reason: str, **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-class CustomCheckError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-class HtmlEmptyError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '正文获取异常,正文为空', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-class CheckPrePareRequest:
-
-    def __init__(self):
-        self.crawl_keywords = {
-            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
-            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
-            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
-            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
-            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
-            '终止', '系统'
-        }
-
-    def check_crawl_title(self, title: str):
-        for keyword in self.crawl_keywords:
-            valid_keyword = re.search(keyword, title)
-            if valid_keyword is not None:
-                break
+            construction_area = re.sub(":|:", "", construction_area)
+
+        if not floor_area:
+            floor_area = ""
         else:
-            # raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
-            return 10106,'标题未检索到采集关键词'
-        return 200,'ok'
+            floor_area = re.sub(":|:", "", floor_area)
+
+        data_item.project_scale = project_scale
+        data_item.project_scale_info = {
+            "construction_area": construction_area,
+            "floor_area": floor_area,
+        }  # construction scale and main content
+
+    fields_dict = {
+        "projectname": projectname,
+        "owner": owner,
+        "total_investment": total_investment,
+        "project_person": project_person,
+        "project_phone": project_phone,
+        "approvedept": approvedept,
+        "approvetime": approvetime,
+        "project_completedate": project_completedate,
+        "projectaddr": projectaddr,
+        "approvecode": approvecode,
+        "approvecontent": approvecontent,
+        "approvenumber": approvenumber
+    }
+    for fields_k, fields_v in fields_dict.items():
+        if fields_v:
+            fields_v[0] = clean_chars(fields_v[0])
+            if not fields_v[0]:
+                continue
 
+            data_item[fields_k] = re.sub(
+                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
+                "", fields_v[0])
 
-    def __check(self, rows: dict):
-        title, publish_time = rows['title'], rows['l_np_publishtime']
-        self.check_crawl_title(title)
+    return data_item
 
-    def __call__(self, rows: dict, *args, **kwargs):
-        self.__check(rows)
 
 def get_proxy():
     headers = {
@@ -167,32 +223,39 @@ def get_proxy():
     proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
     print(f"切换代理:{proxy.get('data')}")
     return proxy.get("data").get("http")
-import json
 
-class Obj(object):
-    def __init__(self, dict_):
-        self.__dict__.update(dict_)
-
-def get_argvs():
-    argvs = {"next_page":False,"max_page":10}
-    for item in sys.argv[1:]:
-        print(item)
-        if item.startswith("--"):
-            argvs[item.replace("--", "").split('=')[0]] = int(item.split('=')[-1])
-    return json.loads(json.dumps(argvs), object_hook=Obj)
 
 def search(pattern, string):
     result = re.search(pattern, string)
     if result:
         return result.groups()[0]
 
+
 def search_construction(string):
     result = re.search('pattern', string)
     if result:
         return result.groups()[0]
 
+
 def search_floor(string):
     result = re.search('pattern', string)
     if result:
         return result.groups()[0]
 
+
+def get_floor_area(project_scale):
+    floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+    if not floor_area:
+        floor_area = ""
+    else:
+        floor_area = floor_area.replace(':', '').replace(':', '')
+    return floor_area
+
+
+def get_construction_area(project_scale):
+    construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+    if not construction_area:
+        construction_area = ""
+    else:
+        construction_area = construction_area.replace(':', '').replace(':', '')
+    return construction_area
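A quick sketch of the new text helpers in use (inputs are made up; outputs follow the regexes above):

```python
from untils.tools import clean_chars, get_construction_area, get_floor_area

clean_chars(" 合同\t金额 ")                                 # -> "合同金额" (whitespace removed)
get_construction_area("总建筑面积约5000平方米,总投资1亿元。")  # -> "5000平方米"
get_floor_area("总占地面积约30亩,建设内容包括道路及绿化。")     # -> "30亩"
```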

+ 0 - 0
NoteWork/python乱码识别/__init__.py


+ 0 - 0
NoteWork/文档/img.png


+ 0 - 0
NoteWork/文档/img_1.png


+ 0 - 0
NoteWork/文档/img_10.png


+ 0 - 0
NoteWork/文档/img_11.png


+ 0 - 0
NoteWork/文档/img_12.png


+ 0 - 0
NoteWork/文档/img_13.png


+ 0 - 0
NoteWork/文档/img_2.png


+ 0 - 0
NoteWork/文档/img_3.png


+ 0 - 0
NoteWork/文档/img_4.png


+ 0 - 0
NoteWork/文档/img_5.png


+ 0 - 0
NoteWork/文档/img_6.png


+ 0 - 0
NoteWork/文档/img_7.png


+ 0 - 0
NoteWork/文档/img_8.png


+ 0 - 0
NoteWork/文档/img_9.png


+ 0 - 29
NoteWork/文档/update.md

@@ -1,29 +0,0 @@
-### 1、快照页:已完成
-    附件采集方法        
-### 2、关联lua爬虫接口:已完成
-    待开发爬虫任务管理  #
-### 3、报警修改   爬虫报警规则:已完成
-	1、失败一定次数  
-	2、爬虫当前任务成功率过低
-	3、爬虫导出数据失败一定次数
-	4、爬虫任务停滞
-	5、爬虫异常停止
-```python
-
-
-```
-
-### 4、爬虫校验,同时只运行一个  *无需修改 
-    爬虫为分布式爬虫,后续新建任务不会重新运行,
-	会读取当前爬虫中未完成的任务,协同执行  
-    /// 若两个爬虫同一时间开始执行,这时无法处理 
-
-
-### 5、重新采集的一个字段  :已完成
-    关于正文/其他数据采集为空,这里进行了处理,停止当前管道线程,把其当做错误请求处理,
-	五次容错机会,五次均失败后丢弃当前连接,等待下一轮爬虫执行时重试
-
-### 6、快速定位,项目爬虫代码、指定人员  :可指定人员、获取爬虫名称,但无法直接跳转到爬虫文件 
-        可分层级,每个角色-单独一个爬虫,按地区分文件 以便快速查找爬虫文件
-### 7、管理平台消息自定义  无需改动
-    # 两个消息发送方式:爬虫结束时发送,爬虫异常结束时报错

+ 0 - 108
NoteWork/文档/开发文档.md

@@ -1,108 +0,0 @@
-
-## feapder spider development guide
-#### Installing the local debugging environment
-    Python environment (install Python)
-    redis + mongo
-#### Creating a spider
-    Create from the command line:  create -s ***** 4
-
-### Editing the spider
-1. Edit the basics: site info, channel (menu) info, and so on
-```python
-	def start_callback(self):
-		Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-	    self.site= "测试地址平台采集"
-        self.menus = [
-             # Menu('Ceshidizhi channel name', 'Ceshidizhi spider code', "custom parameter", 1),
-             # Menu('Ceshidizhi channel name', 'Ceshidizhi spider code', "Notice", 1),
-             Menu('政府采购-采购公告', 'hn_ceshidizhi_zfcg_cggg', "zfcg/cggg", 1),
-             Menu('综合其他-中标前公示', 'hn_ceshidizhi_zhqt_zbqgs', "zhqt/zbqgs", 1),
-         ]
-         
-	def start_requests(self):
-    	for menu in self.menus:
-        	for page in range(1,menu.crawl_page+1):
-            	start_url = f'http://www.ceshi.com/{menu.types}'
-	            yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False)
-```
-
-2. Configure the start URL for each channel. The proxy IP is disabled by default; to enable it, change proxies to True (see the sketch below).
-
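A hedged sketch of that switch (not part of the original document), reusing the same start_requests shape as the snippet above; how proxies=True is honoured depends on this project's Request wrapper.

```python
    def start_requests(self):
        for menu in self.menus:
            for page in range(1, menu.crawl_page + 1):
                start_url = f'http://www.ceshi.com/{menu.types}'
                # proxies=True instead of the default False turns the proxy on,
                # as described in step 2 above.
                yield feapder.Request(url=start_url, item=menu._asdict(), proxies=True)
```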
-3. Edit the list-page parsing rules: xpath rules / json fields
-```python
-    def parse(self, request, response): # xpath approach:
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []  # no need to modify
-        info_list = response.xpath("//table[@class='p1']/tr[position()>1]")
-        for info in info_list:
-            href = info.xpath('./td[1]/a/@href').extract_first().strip()
-            title = info.xpath('./td[1]/a/text()').extract_first().strip()
-            
-            '''Make sure the time is in 0000-00-00 00:00:00 or 0000-00-00 format'''
-            create_time = info.xpath('./td[5]/text()').extract_first().strip()
-            
-            '''If province/city info is available, always parse or split it according to the site's concrete rules'''
-            area = info.xpath('./td[4]/text()').extract_first()
-            city = info.xpath('./td[4]/text()').extract_first()   # city
-            area = area if area else "全国"   # province
-            city = city if city else ""   # city
-            
-    def parse(self, request, response): # json approach:
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("list")
-        for info in info_list:
-            href = info.get("href")
-            title = info.get("title")
-            create_time = info.get("create_time")
-            area = info.get("area")
-            city = info.get("city")
-            area = area if area else "全国"  # 省份
-            city = city if city else ""  # 城市
-```
-
-4. Edit the detail-page parsing rules (xpath) and write the remaining code
-```python
-    list_item =  MgpListItem()
-    list_item.parse = "self.detail_get"
-    list_item.parser_name = "details"
-    list_item.item = data_item.to_dict
-    list_item.deal_detail = ['//div[@class="content"]']
-    list_item.proxies = False
-    list_item.parse_url = href
-    list_item.author = 'mgp'  # custom author; if omitted, it is derived from the folder name
-    list_item.pri = 1
-    list_item.files = {
-        "list_xpath": '//div[@class="notice-foot"]/a',
-        "url_xpath": './@href',
-        "name_xpath": './text()',
-        "files_type": ('zip', 'docx', 'ftp'),  # attachment types to download
-        "file_type": 'docx',   # default attachment type, used when the url carries no extension
-        "url_key": 'http',  # url keyword used to tell real attachment links apart; required, use 'http' if there is nothing better
-        "host": 'http://www.ceshi.com',  # host to prepend when the url needs to be joined
-            }
-    href_list.append(href)
-    yield list_item
-```
-
-### Deploying the spider
-    1. Put the finished spider under your own spider folder; examples are shown below
-![screenshot](https://img-blog.csdnimg.cn/061efe986db8402bb13b482c8d447f91.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-![screenshot](https://img-blog.csdnimg.cn/75d4c7851a2e435cafac29f627faaa4b.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-    2. Create a scheduled task based on the volume of data the spider collects
-![screenshot](https://img-blog.csdnimg.cn/227f32935f8e4f4fa6b19bea96805b37.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-![screenshot](https://img-blog.csdnimg.cn/3f4e2bffe2e042eca0cbc35b99817f81.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-
-
-    3. Once the scheduled task is created, just click Enable
-
-![screenshot](https://img-blog.csdnimg.cn/ffe8e2ec981d4f798b7efa44406926be.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-
-
-

+ 1 - 2
README.md

@@ -1,2 +1 @@
-# 
-
+# 

Some files were not shown because too many files changed in this diff