
update: optimize several components and methods

dongzhaorui · 2 years ago · commit 73209a47a7
72 changed files with 2443 additions and 4924 deletions
  1. FworkSpider/feapder/VERSION  (+15 -2)
  2. FworkSpider/feapder/buffer/__init__.py  (+29 -33)
  3. FworkSpider/feapder/buffer/request_buffer.py  (+5 -5)
  4. FworkSpider/feapder/commands/create/create_spider.py  (+14 -8)
  5. FworkSpider/feapder/commands/shell.py  (+5 -5)
  6. FworkSpider/feapder/core/__init__.py  (+10 -64)
  7. FworkSpider/feapder/core/collector.py  (+42 -25)
  8. FworkSpider/feapder/core/handle_failed_requests.py  (+1 -1)
  9. FworkSpider/feapder/core/parser_control.py  (+499 -453)
  10. FworkSpider/feapder/core/scheduler.py  (+109 -97)
  11. FworkSpider/feapder/core/spiders/__init__.py  (+16 -2)
  12. FworkSpider/feapder/core/spiders/air_spider.py  (+3 -18)
  13. FworkSpider/feapder/core/spiders/spider.py  (+149 -16)
  14. FworkSpider/feapder/db/__init__.py  (+1 -5)
  15. FworkSpider/feapder/db/mysqldb.py  (+107 -76)
  16. FworkSpider/feapder/dedup/bitarray.py  (+14 -20)
  17. FworkSpider/feapder/dedup/expirefilter.py  (+13 -2)
  18. FworkSpider/feapder/dedup/old__init__.py  (+0 -178)
  19. FworkSpider/feapder/network/__init__.py  (+40 -73)
  20. FworkSpider/feapder/network/item.py  (+0 -20)
  21. FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt  (+0 -20)
  22. FworkSpider/feapder/network/proxy_pool.py  (+5 -9)
  23. FworkSpider/feapder/network/request.py  (+23 -37)
  24. FworkSpider/feapder/network/request6.29.py  (+0 -513)
  25. FworkSpider/feapder/network/response.py  (+2 -1)
  26. FworkSpider/feapder/setting.py  (+6 -4)
  27. FworkSpider/feapder/templates/air_spider_template.tmpl  (+98 -30)
  28. FworkSpider/feapder/templates/project_template/CHECK_DATA.md  (+61 -45)
  29. FworkSpider/feapder/utils/__init__.py  (+0 -177)
  30. FworkSpider/feapder/utils/custom_argparse.py  (+1 -1)
  31. FworkSpider/feapder/utils/js/stealth.min.js  (+53 -41)
  32. FworkSpider/feapder/utils/metrics.py  (+14 -8)
  33. FworkSpider/feapder/utils/tools.py  (+37 -32)
  34. FworkSpider/feapder/utils/webdriver.py  (+167 -72)
  35. FworkSpider/items/__init__.py  (+71 -53)
  36. FworkSpider/items/spider_item.py  (+115 -111)
  37. FworkSpider/login_pool/__init__.py  (+0 -0)
  38. FworkSpider/login_pool/zglbw.py  (+0 -95)
  39. FworkSpider/mongo_pipeline.py  (+0 -56)
  40. FworkSpider/mongo_pipeline_old.py  (+0 -98)
  41. FworkSpider/setting.py  (+89 -151)
  42. FworkSpider/untils/WebCookiePool.py  (+13 -30)
  43. FworkSpider/untils/__init__.py  (+13 -2)
  44. FworkSpider/untils/attachment.py  (+212 -169)
  45. FworkSpider/untils/chaojiying.py  (+0 -61)
  46. FworkSpider/untils/clean_html/__init__.py  (+0 -0)
  47. FworkSpider/untils/clean_html/defaults.py  (+0 -131)
  48. FworkSpider/untils/cleaner.py  (+0 -136)
  49. FworkSpider/untils/cookie_pool.py  (+62 -654)
  50. FworkSpider/untils/create_menus.py  (+0 -33)
  51. FworkSpider/untils/execptions.py  (+11 -15)
  52. FworkSpider/untils/get_imgcode.py  (+129 -12)
  53. FworkSpider/untils/proxy_pool.py  (+2 -762)
  54. FworkSpider/untils/tools.py  (+186 -123)
  55. NoteWork/python乱码识别/__init__.py  (+0 -0)
  56. NoteWork/文档/img.png  (+0 -0)
  57. NoteWork/文档/img_1.png  (+0 -0)
  58. NoteWork/文档/img_10.png  (+0 -0)
  59. NoteWork/文档/img_11.png  (+0 -0)
  60. NoteWork/文档/img_12.png  (+0 -0)
  61. NoteWork/文档/img_13.png  (+0 -0)
  62. NoteWork/文档/img_2.png  (+0 -0)
  63. NoteWork/文档/img_3.png  (+0 -0)
  64. NoteWork/文档/img_4.png  (+0 -0)
  65. NoteWork/文档/img_5.png  (+0 -0)
  66. NoteWork/文档/img_6.png  (+0 -0)
  67. NoteWork/文档/img_7.png  (+0 -0)
  68. NoteWork/文档/img_8.png  (+0 -0)
  69. NoteWork/文档/img_9.png  (+0 -0)
  70. NoteWork/文档/update.md  (+0 -29)
  71. NoteWork/文档/开发文档.md  (+0 -108)
  72. README.md  (+1 -2)

+ 15 - 2
FworkSpider/feapder/VERSION

@@ -7,8 +7,9 @@ Created on 2020/4/21 10:41 PM
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-import os, sys
+import os
 import re
+import sys
 
 sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
 
@@ -16,6 +17,10 @@ __all__ = [
     "AirSpider",
     "Spider",
     "BatchSpider",
+    "BiddingListSpider",
+    "BiddingDetailSpider",
+    "PlanToBuildListSpider",
+    "PlanToBuildDetailSpider",
     "BaseParser",
     "BatchParser",
     "Request",
@@ -25,7 +30,15 @@ __all__ = [
     "ArgumentParser",
 ]
 
-from feapder.core.spiders import Spider, BatchSpider, AirSpider
+from feapder.core.spiders import (
+    Spider,
+    BatchSpider,
+    AirSpider,
+    BiddingListSpider,
+    BiddingDetailSpider,
+    PlanToBuildListSpider,
+    PlanToBuildDetailSpider,
+)
 from feapder.core.base_parser import BaseParser, BatchParser
 from feapder.network.request import Request
 from feapder.network.response import Response

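The package root now re-exports the four new spider bases alongside the existing ones. A minimal usage sketch, assuming BiddingListSpider keeps the usual feapder parser interface; the subclass name, URL and redis_key below are illustrative, not from this commit:

    import feapder

    class DemoBiddingList(feapder.BiddingListSpider):  # hypothetical subclass
        def start_requests(self):
            # standard feapder hook: yield the first listing page
            yield feapder.Request("https://example.com/list", page=1)

        def parse(self, request, response):
            # standard feapder hook: extract rows / follow detail pages here
            pass

    if __name__ == "__main__":
        DemoBiddingList(redis_key="demo:bidding_list").start()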
+ 29 - 33
FworkSpider/feapder/buffer/__init__.py

@@ -43,7 +43,7 @@ class ItemBuffer(threading.Thread):
 
             self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
 
-            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
+            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
             self._table_failed_items = setting.TAB_FAILED_ITEMS.format(
                 redis_key=redis_key
             )
@@ -99,9 +99,9 @@ class ItemBuffer(threading.Thread):
 
         return self._mysql_pipeline
 
-    def run(self): # step 1 开始
+    def run(self):
         self._thread_stop = False
-        while not self._thread_stop: # 爬虫不停止,就一直循环刷新
+        while not self._thread_stop:
             self.flush()
             tools.delay_time(1)
 
@@ -111,18 +111,14 @@ class ItemBuffer(threading.Thread):
         self._thread_stop = True
         self._started.clear()
 
-    def put_item(self, item): # step 存储数据的入口 将需要存储的数据放入数据管道队列
+    def put_item(self, item):
         if isinstance(item, Item):
             # 入库前的回调
-
-            if item.item_name == "ListItem":  # 测试框架有用,对listitem不进行存储,正式框架没有这个判断
-                return
             item.pre_to_db()
-            # print(item)
-            if item.save: # 根据save字段,判断该条信息是否存储
-                self._items_queue.put(item)
-        else:
+
+        if getattr(item, "save", True):  # save=False 不推送入库
             self._items_queue.put(item)
+
     def flush(self):
         try:
             items = []
@@ -132,26 +128,26 @@ class ItemBuffer(threading.Thread):
             items_fingerprints = []
             data_count = 0
 
-            while not self._items_queue.empty(): # step 2 数据管道队列不为空时时 不等待直接取值
-                data = self._items_queue.get_nowait() # 队列的 不等待直接取值方法,类似get
+            while not self._items_queue.empty():
+                data = self._items_queue.get_nowait()
                 data_count += 1
 
                 # data 分类
                 if callable(data):
                     callbacks.append(data)
 
-                elif isinstance(data, UpdateItem): # 更新型数据,走更新管道,采集框架只存不更新,可以忽略不看
+                elif isinstance(data, UpdateItem):
                     update_items.append(data)
 
                 elif isinstance(data, Item):
                     items.append(data)
-                    if setting.ITEM_FILTER_ENABLE: # item去重,对于当前框架,无效,不看
+                    if setting.ITEM_FILTER_ENABLE:
                         items_fingerprints.append(data.fingerprint)
 
                 else:  # request-redis
                     requests.append(data)
 
-                if data_count >= UPLOAD_BATCH_MAX_SIZE: # step 3 需要存储的数据,达到一定数量后,统一存储
+                if data_count >= UPLOAD_BATCH_MAX_SIZE:
                     self.__add_item_to_db(
                         items, update_items, requests, callbacks, items_fingerprints
                     )
@@ -163,7 +159,7 @@ class ItemBuffer(threading.Thread):
                     items_fingerprints = []
                     data_count = 0
 
-            if data_count: # step 3 管道为空后,将剩余的数据,统一存储
+            if data_count:
                 self.__add_item_to_db(
                     items, update_items, requests, callbacks, items_fingerprints
                 )
@@ -248,11 +244,8 @@ class ItemBuffer(threading.Thread):
         return datas_dict
 
     def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        # step 3.1.1 打点 记录总条数及每个key情况
-        self.check_datas(table=table, datas=datas)
-
-        for pipeline in self._pipelines: # setting 配置的piplines方法
-            if is_update: # 更新方法 不看
+        for pipeline in self._pipelines:
+            if is_update:
                 if table == self._task_table and not isinstance(
                     pipeline, MysqlPipeline
                 ):
@@ -265,7 +258,7 @@ class ItemBuffer(threading.Thread):
                     return False
 
             else:
-                if not pipeline.save_items(table, datas): # step 3.1.2 调用pipline的 save_items 方法
+                if not pipeline.save_items(table, datas):
                     log.error(
                         f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
                     )
@@ -281,19 +274,22 @@ class ItemBuffer(threading.Thread):
                 )
                 return False
 
+        self.metric_datas(table=table, datas=datas)
         return True
 
+    def export_to_db(self, table, datas, **kwargs):
+        return self.__export_to_db(table, datas, **kwargs)
+
     def __add_item_to_db(
         self, items, update_items, requests, callbacks, items_fingerprints
     ):
         export_success = True
         self._is_adding_to_db = True
 
-        # 去重 item去重,不看
         if setting.ITEM_FILTER_ENABLE:
             items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
 
-        # step 分捡 将每个表之间的数据分开 拆分后 原items为空
+        # 分捡
         items_dict = self.__pick_items(items)
         update_items_dict = self.__pick_items(update_items, is_update_item=True)
 
@@ -311,7 +307,7 @@ class ItemBuffer(threading.Thread):
                 % (table, tools.dumps_json(datas, indent=16))
             )
 
-            if not self.__export_to_db(table, datas): # step 3.1 导出到数据库
+            if not self.__export_to_db(table, datas):
                 export_success = False
                 failed_items["add"].append({"table": table, "datas": datas})
 
@@ -336,7 +332,7 @@ class ItemBuffer(threading.Thread):
                 failed_items["update"].append({"table": table, "datas": datas})
 
         if export_success:
-            # step 3.2 保存成功后,执行的执行回调
+            # 执行回调
             while callbacks:
                 try:
                     callback = callbacks.pop(0)
@@ -344,17 +340,15 @@ class ItemBuffer(threading.Thread):
                 except Exception as e:
                     log.exception(e)
 
-            # step 删除做过的request
+            # 删除做过的request
             if requests:
                 self.redis_db.zrem(self._table_request, requests)
 
-            # 去重入库 不走这个去重
+            # 去重入库
             if setting.ITEM_FILTER_ENABLE:
                 if items_fingerprints:
                     self.__class__.dedup.add(items_fingerprints, skip_check=True)
         else:
-            # step 3.2 保存失败后,执行的执行回调
-
            failed_items["requests"] = requests
 
            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
@@ -412,17 +406,19 @@ class ItemBuffer(threading.Thread):
 
        self._is_adding_to_db = False
 
-    def check_datas(self, table, datas):
+    def metric_datas(self, table, datas):
        """
        打点 记录总条数及每个key情况
        @param table: 表名
        @param datas: 数据 列表
        @return:
        """
-        metrics.emit_counter("total count", len(datas), classify=table)
+        total_count = 0
        for data in datas:
+            total_count += 1
            for k, v in data.items():
                metrics.emit_counter(k, int(bool(v)), classify=table)
+        metrics.emit_counter("total count", total_count, classify=table)
 
    def close(self):
        # 调用pipeline的close方法

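put_item above now gates queueing on an optional save flag instead of special-casing ListItem. A self-contained sketch of that pattern, with a stand-in item class (names are illustrative, not the repo's Item):

    from queue import Queue

    class DemoItem:
        save = True  # assumption: items may carry an optional `save` flag

    def put_item(items_queue: Queue, item) -> None:
        # real Items would run their pre_to_db() hook first
        if getattr(item, "save", True):  # a missing flag defaults to "store it"
            items_queue.put(item)

    q = Queue()
    put_item(q, DemoItem())   # queued
    skipped = DemoItem()
    skipped.save = False
    put_item(q, skipped)      # dropped: save=False means "do not export"
    assert q.qsize() == 1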
+ 5 - 5
FworkSpider/feapder/buffer/request_buffer.py

@@ -34,8 +34,8 @@ class RequestBuffer(threading.Thread):
             self._del_requests_deque = collections.deque()
             self._db = RedisDB()
 
-            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
-            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
+            self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
                 redis_key=redis_key
             )
 
@@ -44,9 +44,9 @@
                     name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
                 )  # 默认过期时间为一个月
 
-    def run(self): # step 1 线程入口
+    def run(self):
         self._thread_stop = False
-        while not self._thread_stop: # 每隔一分钟进行一次 将产生的任务存储
+        while not self._thread_stop:
             try:
                 self.__add_request_to_db()
             except Exception as e:
@@ -94,7 +94,7 @@
         callbacks = []
 
         while self._requests_deque:
-            request = self._requests_deque.popleft() # 从任务队列中从左取任务(先进先出)
+            request = self._requests_deque.popleft()
             self._is_adding_to_db = True
 
             if callable(request):

+ 14 - 8
FworkSpider/feapder/commands/create/create_spider.py

@@ -16,10 +16,10 @@ import feapder.utils.tools as tools
 from .create_init import CreateInit
 
 
-def deal_file_info(file):
+def deal_file_info(file, author):
     file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
+    # file = file.replace("{USER}", getpass.getuser())
+    file = file.replace("{USER}", author)
     return file
 
 
@@ -57,8 +57,14 @@ class CreateSpider:
             template_path = "batch_spider_template.tmpl"
         elif spider_type == 4:
             template_path = "spider_list_template.tmpl"
+        elif spider_type == 5:
+            template_path = "detail_template.tmpl"
+        elif spider_type == 6:
+            template_path = "njpc_list_template.tmpl"
+        elif spider_type == 7:
+            template_path = "njpc_detail_template.tmpl"
         else:
-            raise ValueError("spider type error, support 1 2 3")
+            raise ValueError("spider type error, support 1 2 3 4 5 6 7")
 
         template_path = os.path.abspath(
             os.path.join(__file__, "../../../templates", template_path)
@@ -68,9 +74,9 @@ class CreateSpider:
 
         return spider_template
 
-    def create_spider(self, spider_template, spider_name):
+    def create_spider(self, spider_template, spider_name, author):
         spider_template = spider_template.replace("${spider_name}", spider_name)
-        spider_template = deal_file_info(spider_template)
+        spider_template = deal_file_info(spider_template, author)
         return spider_template
 
     def save_spider_to_file(self, spider, spider_name):
@@ -89,7 +95,7 @@
 
         self._create_init.create()
 
-    def create(self, spider_name, spider_type):
+    def create(self, spider_name, spider_type, author):
         # 检查spider_name
         if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
             raise Exception("爬虫名不符合命名规范,请用下划线命名或驼峰命名方式")
@@ -97,5 +103,5 @@
         if spider_name.islower():
             spider_name = tools.key2hump(spider_name)
         spider_template = self.get_spider_template(spider_type)
-        spider = self.create_spider(spider_template, spider_name)
+        spider = self.create_spider(spider_template, spider_name, author)
         self.save_spider_to_file(spider, spider_name)

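deal_file_info now stamps the author passed down from the create command instead of getpass.getuser(). A standalone sketch of the substitution step; the template string, date format and author value below are made up for illustration:

    from datetime import datetime

    def deal_file_info(file: str, author: str) -> str:
        # same substitution order as above: stamp the date, then the author
        file = file.replace("{DATE}", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        file = file.replace("{USER}", author)
        return file

    template = '"""Created on {DATE}\n@author: {USER}\n"""'
    print(deal_file_info(template, author="some_author"))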
+ 5 - 5
FworkSpider/feapder/commands/shell.py

@@ -58,13 +58,13 @@ def fetch_curl(curl_args):
 
 def usage():
     """
-下载调试器
+    下载调试器
 
-usage: feapder shell [options] [args]
+    usage: feapder shell [options] [args]
 
-optional arguments:
-  -u, --url     抓取指定url
-  -c, --curl    抓取curl格式的请求
+    optional arguments:
+      -u, --url     抓取指定url
+      -c, --curl    抓取curl格式的请求
 
     """
     print(usage.__doc__)

+ 10 - 64
FworkSpider/feapder/core/__init__.py

@@ -8,15 +8,11 @@ Created on 2018-07-25 11:41:57
 @email:  boris_liu@foxmail.com
 """
 import os
-import traceback
 
-import feapder
 import feapder.utils.tools as tools
 from feapder.db.mysqldb import MysqlDB
 from feapder.network.item import UpdateItem
 from feapder.utils.log import log
-from feapder.utils.aliyun import UploadOSS
-from feapder.db.redisdb import RedisDB
 
 
 class BaseParser(object):
@@ -30,6 +26,16 @@ class BaseParser(object):
 
         pass
 
+        """
+        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
+        ---------
+        @param request:
+        ---------
+        @result: return request / request, response
+        """
+
+        pass
+
     def download_midware(self, request):
         """
         @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
@@ -91,66 +97,6 @@ class BaseParser(object):
         """
 
         pass
-    def infinite_crawl(self,request,response):
-        menu = request.item
-        list_item = request.list_item
-        if self.platform_next_page:  # real_page为连续翻页采集为0
-            if getattr(request, 'real_page', None) is not None:
-                request.real_page = 0
-
-            request.real_page += 1
-            if list_item.rel_count > 0:
-                request.real_page = 0
-
-            if request.real_page <= 5 and request.page < self.platform_max_page:
-                request.page += 1
-                request.callback = self.parse
-                if getattr(request, 'new_callback', None) is not None:
-                    request.callback = eval(request.new_callback)
-                    yield request
-        else:
-            if request.page < menu.get("crawl_page"):
-                request.page += 1
-                request.callback = self.parse
-                if getattr(request, 'new_callback', None) is not None:
-                    request.callback = eval(request.new_callback)
-                    yield request
-
-    def push_files(self, request, response):
-        """
-        @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
-        ---------
-        @param request:  request.url 为文件下载地址, 该方法需要自行调用
-        request.INFO  为上传文件时所需要提供的部分参数  必传
-         info = {
-            "org_url": "http://www...",  # 文件下载连接
-            "filename": f"{list_item.title}.docx",  # 文件名
-            "channel": list_item.channel,
-            "ftype": 'docx,zip,ftp', # 文件类型
-        }
-        request.headers 则存放请求的必要参数,如:parmas,headers  必传
-        ---------
-        @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
-        """
-        list_item = request.item
-        res = None
-        for i in range(5):
-            try:
-                parameter = request.parameter
-                res = UploadOSS().get_state(request.info,**parameter)
-            except:
-                log.error(traceback.format_exc())
-            if res is not None:
-                list_item.projectinfo = res
-                yield list_item
-                log.info(f"{res.get('filename')}附件下载完成,大小为:{res.get('size')},fid为:{res.get('fid')}")
-                return
-            else:
-                log.error(f"{res.get('filename')}附件下载失败,失败连接为:{res.get('org_url')}")
-        if res is None:
-            _db = RedisDB()
-            request_dict = request.to_dict
-            _db.zadd("forwork:files_failed", request_dict)
 
     def start_callback(self):
         """

+ 42 - 25
FworkSpider/feapder/core/collector.py

@@ -7,10 +7,9 @@ Created on 2016-12-23 11:24
 @author: Boris
 @email: boris_liu@foxmail.com
 """
-
-import collections
 import threading
 import time
+from queue import Queue, Empty
 
 import feapder.setting as setting
 import feapder.utils.tools as tools
@@ -20,6 +19,7 @@ from feapder.utils.log import log
 
 
 class Collector(threading.Thread):
+
     def __init__(self, redis_key):
         """
         @summary:
@@ -34,9 +34,9 @@ class Collector(threading.Thread):
 
         self._thread_stop = False
 
-        self._todo_requests = collections.deque()
+        self._todo_requests = Queue(maxsize=setting.COLLECTOR_TASK_COUNT)
 
-        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
+        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
         self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
 
         self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"
@@ -52,7 +52,7 @@ class Collector(threading.Thread):
         self._thread_stop = False
         while not self._thread_stop:
             try:
-                self.__report_node_heartbeat() # step 汇报节点心跳
+                self.__report_node_heartbeat()  # step 汇报节点心跳
                 self.__input_data()
             except Exception as e:
                 log.exception(e)
@@ -66,25 +66,29 @@ class Collector(threading.Thread):
         self._started.clear()
 
     def __input_data(self):
-        current_timestamp = tools.get_current_timestamp()
-        if len(self._todo_requests) >= self._request_count: # step 待执行任务数量>设置的任务数量上限 不处理
+        if self._request_count / setting.SPIDER_THREAD_COUNT > 1 and (
+            self._todo_requests.qsize() > setting.SPIDER_THREAD_COUNT
+            or self._todo_requests.qsize() >= self._todo_requests.maxsize
+        ):  # 当任务总数大于线程数 且 内存队列持有任务总数大于线程数 此时不添加任务
+            time.sleep(0.1)
             return
 
+        current_timestamp = tools.get_current_timestamp()
+
         request_count = self._request_count  # 先赋值
-        # step 查询最近有心跳的节点数量
+        # 查询最近有心跳的节点数量
         spider_count = self._db.zget_count(
             self._tab_spider_status,
             priority_min=current_timestamp - (self._interval + 10),
             priority_max=current_timestamp,
         )
-        # step 根据等待节点数量,动态分配request
+        # 根据等待节点数量,动态分配request
         if spider_count:
             # 任务数量
             task_count = self._db.zget_count(self._tab_requests)
             # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
             request_count = task_count // spider_count + 1
 
-        # step 判断 request_count 数量是否大于 设置的上限 ,大于上限,重置
         request_count = (
             request_count
             if request_count <= self._request_count
@@ -108,7 +112,7 @@ class Collector(threading.Thread):
             if lose_count:
                 log.info("重置丢失任务完毕,共{}条".format(len(datas)))
 
-        # 取任务,只取当前时间搓以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
+        # 取任务,只取当前时间搓以内的任务,同时将取走的任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
         requests_list = self._db.zrangebyscore_set_score(
             self._tab_requests,
             priority_min="-inf",
@@ -117,10 +121,14 @@ class Collector(threading.Thread):
             count=request_count,
         )
 
+        log.debug("领取新任务完毕,共{}条".format(len(requests_list)))
+
         if requests_list:
             self._is_collector_task = True
             # 存request
             self.__put_requests(requests_list)
+        else:
+            time.sleep(0.1)
 
     def __report_node_heartbeat(self):
         """
@@ -150,28 +158,37 @@ class Collector(threading.Thread):
             except Exception as e:
                 log.exception(
                     """
-                error %s
-                request %s
-                """
+                    error %s
+                    request %s
+                    """
                     % (e, request)
                 )
-
                 request_dict = None
 
             if request_dict:
-                self._todo_requests.append(request_dict)
-
-    def get_requests(self, count):
-        requests = []
-        count = count if count <= len(self._todo_requests) else len(self._todo_requests)
-        while count:
-            requests.append(self._todo_requests.popleft())
-            count -= 1
+                self._todo_requests.put(request_dict)
 
-        return requests
+    def get_request(self):
+        try:
+            request = self._todo_requests.get(timeout=1)
+            return request
+        except Empty as e:
+            return None
 
     def get_requests_count(self):
-        return len(self._todo_requests) or self._db.zget_count(self._tab_requests) or 0
+        return (
+            self._todo_requests.qsize() or self._db.zget_count(self._tab_requests) or 0
+        )
 
     def is_collector_task(self):
         return self._is_collector_task
+
+    def get_spider_count(self):
+        return self._db.zget_count(
+            self._tab_spider_status,
+            priority_min=tools.get_current_timestamp() - (self._interval + 10),
+            priority_max=tools.get_current_timestamp(),
+        )
+
+    def delete_spider_node(self):
+        self._db.zrem(self._tab_spider_status, self._spider_mark)

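The collector's hand-off switches from a deque drained in batches to a bounded Queue consumed one request at a time. A stripped-down sketch of the new get_request pattern, with Redis and settings omitted; the maxsize of 100 is an assumption, not the real COLLECTOR_TASK_COUNT:

    from queue import Queue, Empty
    from typing import Optional

    todo_requests: Queue = Queue(maxsize=100)  # bounded in-memory task buffer

    def get_request(timeout: float = 1.0) -> Optional[dict]:
        # block briefly for a task; None tells the caller there is nothing to do yet
        try:
            return todo_requests.get(timeout=timeout)
        except Empty:
            return None

    todo_requests.put({"url": "https://example.com"})
    assert get_request() == {"url": "https://example.com"}
    assert get_request(timeout=0.01) is None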
+ 1 - 1
FworkSpider/feapder/core/handle_failed_requests.py

@@ -24,7 +24,7 @@ class HandleFailedRequests(object):
         self._redisdb = RedisDB()
         self._request_buffer = RequestBuffer(self._redis_key)
 
-        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+        self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
             redis_key=redis_key
         )
 

File diff suppressed because it is too large
+ 499 - 453
FworkSpider/feapder/core/parser_control.py


+ 109 - 97
FworkSpider/feapder/core/scheduler.py

@@ -13,29 +13,34 @@ import threading
 import time
 from collections import Iterable
 
-
 import feapder.setting as setting
 import feapder.utils.tools as tools
 from feapder.buffer.item_buffer import ItemBuffer
 from feapder.buffer.request_buffer import RequestBuffer
 from feapder.core.base_parser import BaseParser
 from feapder.core.collector import Collector
+from feapder.core.handle_failed_items import HandleFailedItems
 from feapder.core.handle_failed_requests import HandleFailedRequests
 from feapder.core.parser_control import PaserControl
 from feapder.db.redisdb import RedisDB
 from feapder.network.item import Item
 from feapder.network.request import Request
+from feapder.utils import metrics
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
-from feapder.utils import metrics
 
-SPIDER_START_TIME_KEY = "spider_start_time"
+SPIDER_UUID = tools.get_uuid()
+SPIDER_START_TIME = "spider_start_time"
+SPIDER_START_TIME_KEY = SPIDER_START_TIME + "#" + SPIDER_UUID
 SPIDER_END_TIME_KEY = "spider_end_time"
 SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
 
+
 class Obj(object):
     def __init__(self, dict_):
         self.__dict__.update(dict_)
+
+
 class Scheduler(threading.Thread):
     __custom_setting__ = {}
 
@@ -78,7 +83,15 @@ class Scheduler(threading.Thread):
             else:
                 setattr(setting, key, value)
 
-
+        # 历史爬虫[redis_key]
+        for item in sys.argv[1:]:
+            if item.startswith("--purpose"):
+                val = item.split('=')[-1]
+                if not redis_key.endswith(val):
+                    # 历史爬虫需要单独的redis_key,防止增量爬虫
+                    # 与历史爬虫共用同一个redis_key,出现增量爬虫断点续采的情况
+                    redis_key += f'_{val}'
+
         self._redis_key = redis_key or setting.REDIS_KEY
         if not self._redis_key:
             raise Exception(
@@ -129,11 +142,12 @@ class Scheduler(threading.Thread):
 
         self._spider_name = redis_key
         self._project_name = redis_key.split(":")[0]
+        self._task_table = task_table
 
         self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
         self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
-        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
-        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
+        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
+        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
             redis_key=redis_key
         )
 
@@ -171,23 +185,16 @@ class Scheduler(threading.Thread):
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
 
     def run(self):  # STEP 1 爬虫框架入口
-        if not self.is_reach_next_spider_time(): # STEP 2 检测爬虫是否到达执行时间
+        if not self.is_reach_next_spider_time():  # STEP 2 检测爬虫是否到达执行时间
            return
 
-        self._start() # STEP 3 开始运行爬虫
+        self._start()  # STEP 3 开始运行爬虫
 
-        while True: # step 4 对爬虫状态的一个监控
+        while True:  # step 4 对爬虫状态的一个监控
            try:
                if self.all_thread_is_done(): # Step 5 判断爬虫是否运行完成
                    if not self._is_notify_end:
                        self.spider_end()  # 跑完一轮
-                        self.record_spider_state(  # step 6 应该是一个通知爬虫结束的方法
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
                        self._is_notify_end = True
 
                    if not self._keep_alive: # step 7 如果不是常驻爬虫 停止所有线程
@@ -197,7 +204,7 @@ class Scheduler(threading.Thread):
                else:
                    self._is_notify_end = False
 
-                self.check_task_status() # step 8 检查任务状态,并进行告警通知
+                self.check_task_status()  # step 8 检查任务状态,并进行告警通知
 
            except Exception as e:
                log.exception(e)
@@ -207,15 +214,8 @@ class Scheduler(threading.Thread):
    def __add_task(self):
        # 启动parser 的 start_requests
        self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
-        self.record_spider_state(
-            spider_type=1,
-            state=0,
-            batch_date=tools.get_current_date(),
-            spider_start_time=tools.get_current_date(),
-            batch_interval=self._batch_interval,
-        )
 
-        # 判断任务池中属否还有任务,若有接着抓取
+        # 判断任务池中属否还有任务,若有接着抓取,若无则生产新任务
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
@@ -227,17 +227,17 @@ class Scheduler(threading.Thread):
                    raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
 
                result_type = 1
-                for result in results or []: # step 对yield 的数据进行判断处理
-                    if isinstance(result, Request): # Request 加入到任务队列
+                for result in results or []:  # step 对yield 的数据进行判断处理
+                    if isinstance(result, Request):  # Request 加入到任务队列
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1
 
-                    elif isinstance(result, Item): # Item 数据,存入到数据管道队列,等待存储
+                    elif isinstance(result, Item):  # Item 数据,存入到数据管道队列,等待存储
                        self._item_buffer.put_item(result)
                        result_type = 2
 
-                    elif callable(result):  # callbale的request可能是更新数据库操作的函数
+                    elif callable(result):  # callable  request 可能是更新数据库操作的函数
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
@@ -253,12 +253,21 @@ class Scheduler(threading.Thread):
                self._item_buffer.flush()
 
    def _start(self):
+        # 将失败的item入库
+        if setting.RETRY_FAILED_ITEMS:
+            handle_failed_items = HandleFailedItems(
+                redis_key=self._redis_key,
+                task_table=self._task_table,
+                item_buffer=self._item_buffer,
+            )
+            handle_failed_items.reput_failed_items_to_db()
 
-        self._request_buffer.start()  # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
-
-        self._item_buffer.start()  # STEP 3.2 启动item_buffer -- 管道管理器 责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
-
-        self._collector.start()  # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
+        # STEP 3.1 启动request_buffer -- 任务管理器, 负责缓冲添加到数据库中的request
+        self._request_buffer.start()
+        # STEP 3.2 启动item_buffer -- 管道管理器 责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
+        self._item_buffer.start()
+        # STEP 3.3 启动collector  -- 任务管理 ,根据节点和任务,平均分配给每个节点
+        self._collector.start()
 
        # 启动parser control
        for i in range(self._thread_count):
@@ -293,7 +302,8 @@ class Scheduler(threading.Thread):
                self.__add_task()
 
    def all_thread_is_done(self):
-        for i in range(3):  # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+        # Stress 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+        for i in range(3):
            # STEP 5.1 检测 collector 状态
            if (
                self._collector.is_collector_task()
@@ -320,7 +330,7 @@ class Scheduler(threading.Thread):
            ):
                return False
 
-            tools.delay_time(1) # 休眠一分钟
+            tools.delay_time(1)  # 休眠1秒
 
        return True
 
@@ -336,6 +346,40 @@ class Scheduler(threading.Thread):
        else:
            return
 
+        # 检查失败任务数量 超过1000 报警,
+        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
+        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<< 失败次数:', failed_count)
+        if failed_count > setting.WARNING_FAILED_COUNT:
+            # 发送报警
+            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
+            log.error(msg)
+            self.send_msg(
+                msg,
+                level="error",
+                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
+            )
+
+        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
+        failed_task_count, success_task_count = PaserControl.get_task_status_count()
+        total_count = success_task_count + failed_task_count
+        if total_count > 0:
+            task_success_rate = success_task_count / total_count
+            if task_success_rate < 0.5:
+                # 发送报警
+                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
+                    self._spider_name,
+                    success_task_count,
+                    failed_task_count,
+                    task_success_rate,
+                )
+                log.error(msg)
+                self.send_msg(
+                    msg,
+                    level="error",
+                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
+                )
+
+        # 判断任务数是否变化
        # step 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
        task_count = self._redisdb.zget_count(self._tab_requests)
 
@@ -346,7 +390,7 @@ class Scheduler(threading.Thread):
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
-                )  # 多进程会重复发消息, 使用reids记录上次统计时间
+                )  # 多进程会重复发消息, 使用redis记录上次统计时间
            else:
                # step 判断时间间隔是否超过20分钟
                lua = """
@@ -357,7 +401,8 @@ class Scheduler(threading.Thread):
                    -- 取值
                    local last_timestamp = redis.call('hget', KEYS[1], field)
                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
-                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
+                        -- 返回任务停滞时间 秒
+                        return current_timestamp - last_timestamp 
                    end
 
                    if not last_timestamp then
@@ -382,49 +427,15 @@ class Scheduler(threading.Thread):
                    msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
-                    log.error(msg)  # TODO 这一步可以加一个print,在平台的日志框里输出
+                    log.error(msg) # TODO 这一步可以加一个print,在平台的日志框里输出
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
                    )
-
        else:
            self._last_task_count = 0
 
-        # 检查失败任务数量 超过1000 报警,
-        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
-        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<失败次数:',failed_count)
-        if failed_count > setting.WARNING_FAILED_COUNT:
-            # 发送报警
-            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
-            log.error(msg)
-            self.send_msg(
-                msg,
-                level="error",
-                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
-            )
-
-        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
-        failed_task_count, success_task_count = PaserControl.get_task_status_count()
-        total_count = success_task_count + failed_task_count
-        if total_count > 0:
-            task_success_rate = success_task_count / total_count
-            if task_success_rate < 0.5:
-                # 发送报警
-                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
-                    self._spider_name,
-                    success_task_count,
-                    failed_task_count,
-                    task_success_rate,
-                )
-                log.error(msg)
-                self.send_msg(
-                    msg,
-                    level="error",
-                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
-                )
-
        # 检查入库失败次数
        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
            msg = "《{}》爬虫导出数据失败,失败次数:{}, 请检查爬虫是否正常".format(
@@ -450,6 +461,11 @@ class Scheduler(threading.Thread):
                if table != self._tab_spider_time:
                    log.info("正在删除key %s" % table)
                    redis.clear(table)
+                else:
+                    keys = redis.hgetall(table)
+                    for key in keys:
+                        if key.startswith(SPIDER_START_TIME):
+                            redis.hdel(table, key)
 
    def _stop_all_thread(self):
        self._request_buffer.stop()
@@ -472,9 +488,12 @@ class Scheduler(threading.Thread):
    def get_argvs(self):
        argvs = {"next_page": False, "max_page": 10}
        for item in sys.argv[1:]:
-            print(item)
+            # print(item)
            if item.startswith("--"):
-                argvs[item.replace("--", "").split('=')[0]] = eval(item.split('=')[-1]) # 此处使用eval的原因是字符串转bool或int
+                key = item.replace("--", "").split('=')[0]
+                val = item.split('=')[-1]
+                if key != 'purpose':
+                    argvs[key] = eval(val)  # 此处使用eval的原因是字符串转bool或int
        return json.loads(json.dumps(argvs), object_hook=Obj)
 
    def spider_begin(self):
@@ -489,8 +508,9 @@ class Scheduler(threading.Thread):
            self._begin_callback()
 
        for parser in self._parsers:
-            parser.platform_next_page = self.get_argvs().next_page
-            parser.platform_max_page = self.get_argvs().max_page
+            parameter = self.get_argvs()
+            parser.platform_next_page = parameter.next_page
+            parser.platform_max_page = parameter.max_page
            parser.start_callback()
 
        # 记录开始时间
@@ -503,7 +523,7 @@ class Scheduler(threading.Thread):
            # 发送消息
            # self.send_msg("《%s》爬虫开始" % self._spider_name)
 
-    def spider_end(self): # step end 爬虫结束时的一些操作
+    def spider_end(self):  # step end 爬虫结束时的一些操作
        self.record_end_time()
 
        if self._end_callback:  # 系统自带的回调,如果自定义回调,则这个回调不会执行
@@ -511,8 +531,8 @@ class Scheduler(threading.Thread):
 
        for parser in self._parsers:
            if not self._keep_alive:
-                parser.close() # 爬虫可自定义close
-            parser.end_callback() # 调用结束回调函数,可在爬虫自定义
+                parser.close()  # 爬虫可自定义close
+            parser.end_callback()  # 调用结束回调函数,可在爬虫自定义
 
        if not self._keep_alive:
            # 关闭webdirver
@@ -530,21 +550,24 @@ class Scheduler(threading.Thread):
        )
        if data:
            begin_timestamp = int(data)
-
-            spand_time = tools.get_current_timestamp() - begin_timestamp
-
+            elapsed_time = tools.get_current_timestamp() - begin_timestamp
            msg = "《%s》爬虫结束,耗时 %s" % (
                self._spider_name,
-                tools.format_seconds(spand_time),
+                tools.format_seconds(elapsed_time),
            )
            log.info(msg)
 
            # self.send_msg(msg)
 
        if self._keep_alive:
-            log.info("爬虫不自动结束, 等待下一轮任务...")
+            log.info("爬虫不自动结束,等待下一轮任务...")
        else:
-            self.delete_tables(self._tab_spider_status)
+            if self._collector.get_spider_count() <= 1:
+                self.delete_tables(self._tab_spider_time)
+                self.delete_tables(self._tab_spider_status)
+            else:
+                # 清除关闭爬虫的心跳记录,防止删除任务共享表,造成爬虫异常僵死
+                self._collector.delete_spider_node()
 
    def record_end_time(self):
        # 记录结束时间
@@ -578,17 +601,6 @@ class Scheduler(threading.Thread):
 
        return True
 
-    def record_spider_state(
-        self,
-        spider_type,
-        state,
-        batch_date=None,
-        spider_start_time=None,
-        spider_end_time=None,
-        batch_interval=None,
-    ):
-        pass
-
    def join(self, timeout=None):
        """
        重写线程的join

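The scheduler now appends the --purpose value to redis_key so history runs do not share state with the incremental spider. A small sketch of that suffix rule (the function name and argv values are illustrative):

    def apply_purpose_suffix(redis_key: str, argv: list) -> str:
        # history runs ("--purpose=...") get their own redis_key so they cannot
        # resume from the incremental spider's request queue
        for item in argv:
            if item.startswith("--purpose"):
                val = item.split("=")[-1]
                if not redis_key.endswith(val):
                    redis_key += f"_{val}"
        return redis_key

    assert apply_purpose_suffix("demo:list", ["--purpose=history"]) == "demo:list_history"
    assert apply_purpose_suffix("demo:list_history", ["--purpose=history"]) == "demo:list_history"
    assert apply_purpose_suffix("demo:list", []) == "demo:list"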
+ 16 - 2
FworkSpider/feapder/core/spiders/__init__.py

@@ -8,8 +8,22 @@ Created on 2020/4/22 12:08 AM
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
 
 
-__all__ = ["AirSpider", "Spider", "BatchSpider"]
+__all__ = [
+    "AirSpider",
+    "Spider",
+    "BatchSpider",
+    "BiddingListSpider",
+    "BiddingDetailSpider",
+    "PlanToBuildListSpider",
+    "PlanToBuildDetailSpider",
+]
 
 
 from feapder.core.spiders.air_spider import AirSpider
 from feapder.core.spiders.air_spider import AirSpider
-from feapder.core.spiders.spider import Spider
 from feapder.core.spiders.batch_spider import BatchSpider
 from feapder.core.spiders.batch_spider import BatchSpider
+from feapder.core.spiders.spider import (
+    Spider,
+    BiddingListSpider,
+    BiddingDetailSpider,
+    PlanToBuildListSpider,
+    PlanToBuildDetailSpider
+)
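
With the expanded __all__, the new business spider classes are importable from the package alongside Spider. A small sketch of the assumed import surface (the classes themselves are defined in spider.py below; whether they are also re-exported from the top-level feapder package depends on feapder/__init__.py, which is not part of this diff):

    from feapder.core.spiders import (
        BiddingListSpider,
        BiddingDetailSpider,
        PlanToBuildListSpider,
        PlanToBuildDetailSpider,
    )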

+ 3 - 18
FworkSpider/feapder/core/spiders/air_spider.py

@@ -126,11 +126,11 @@ class BatchSpider(BatchParser, Scheduler):
         self._check_task_interval = check_task_interval
         self._check_task_interval = check_task_interval
         self._task_limit = task_limit  # mysql中一次取的任务数量
         self._task_limit = task_limit  # mysql中一次取的任务数量
         self._related_task_tables = [
         self._related_task_tables = [
-            setting.TAB_REQUSETS.format(redis_key=redis_key)
+            setting.TAB_REQUESTS.format(redis_key=redis_key)
         ]  # 自己的task表也需要检查是否有任务
         ]  # 自己的task表也需要检查是否有任务
         if related_redis_key:
         if related_redis_key:
             self._related_task_tables.append(
             self._related_task_tables.append(
-                setting.TAB_REQUSETS.format(redis_key=related_redis_key)
+                setting.TAB_REQUESTS.format(redis_key=related_redis_key)
             )
             )
 
 
         self._related_batch_record = related_batch_record
         self._related_batch_record = related_batch_record
@@ -216,7 +216,7 @@ class BatchSpider(BatchParser, Scheduler):
                 is_first_check = False
                 is_first_check = False
 
 
                 # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
                 # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
-                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
                 todo_task_count = self._redisdb.zget_count(tab_requests)
                 todo_task_count = self._redisdb.zget_count(tab_requests)
 
 
                 tasks = []
                 tasks = []
@@ -922,13 +922,6 @@ class BatchSpider(BatchParser, Scheduler):
 
 
             # 爬虫开始
             # 爬虫开始
             self.spider_begin()
             self.spider_begin()
-            self.record_spider_state(
-                spider_type=2,
-                state=0,
-                batch_date=batch_date,
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
         else:
         else:
             log.error("插入新批次失败")
             log.error("插入新批次失败")
 
 
@@ -1028,14 +1021,6 @@ class BatchSpider(BatchParser, Scheduler):
                     ):  # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况)
                     ):  # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况)
                         if not self._is_notify_end:
                         if not self._is_notify_end:
                             self.spider_end()
                             self.spider_end()
-                            self.record_spider_state(
-                                spider_type=2,
-                                state=1,
-                                batch_date=self._batch_date_cache,
-                                spider_end_time=tools.get_current_date(),
-                                batch_interval=self._batch_interval,
-                            )
-
                             self._is_notify_end = True
                             self._is_notify_end = True
 
 
                         if not self._keep_alive:
                         if not self._keep_alive:
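
Note the constant rename from TAB_REQUSETS to TAB_REQUESTS here and in spider.py below: any project code that builds the requests table key must follow the rename. A hedged sketch of what the formatted key might look like (the exact template lives in setting.py and is assumed here to keep feapder's usual shape):

    # assumption: template similar to feapder's default requests-table key
    TAB_REQUESTS = "{redis_key}:z_requests"
    tab_requests = TAB_REQUESTS.format(redis_key="example:bidding")
    # -> "example:bidding:z_requests", the sorted set the collector polls for tasks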

+ 149 - 16
FworkSpider/feapder/core/spiders/spider.py

@@ -16,6 +16,7 @@ import feapder.setting as setting
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.core.base_parser import BaseParser
 from feapder.core.base_parser import BaseParser
 from feapder.core.scheduler import Scheduler
 from feapder.core.scheduler import Scheduler
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
 from feapder.db.redisdb import RedisDB
 from feapder.network.item import Item
 from feapder.network.item import Item
 from feapder.network.request import Request
 from feapder.network.request import Request
@@ -96,7 +97,7 @@ class Spider(
         while True:
         while True:
             try:
             try:
                 # 检查redis中是否有任务
                 # 检查redis中是否有任务
-                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
                 todo_task_count = redisdb.zget_count(tab_requests)
                 todo_task_count = redisdb.zget_count(tab_requests)
 
 
                 if todo_task_count < self._min_task_count:  # 添加任务
                 if todo_task_count < self._min_task_count:  # 添加任务
@@ -160,14 +161,6 @@ class Spider(
         if self._is_distributed_task:  # 有任务时才提示启动爬虫
         if self._is_distributed_task:  # 有任务时才提示启动爬虫
             # begin
             # begin
             self.spider_begin()
             self.spider_begin()
-            self.record_spider_state(
-                spider_type=1,
-                state=0,
-                batch_date=tools.get_current_date(),
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
-
             # 重置已经提示无任务状态为False
             # 重置已经提示无任务状态为False
             self._is_show_not_task = False
             self._is_show_not_task = False
 
 
@@ -194,13 +187,6 @@ class Spider(
                 if self.all_thread_is_done():
                 if self.all_thread_is_done():
                     if not self._is_notify_end:
                     if not self._is_notify_end:
                         self.spider_end()  # 跑完一轮
                         self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
                         self._is_notify_end = True
                         self._is_notify_end = True
 
 
                     if not self._keep_alive:
                     if not self._keep_alive:
@@ -435,3 +421,150 @@ class DebugSpider(Spider):
             tools.delay_time(1)  # 1秒钟检查一次爬虫状态
             tools.delay_time(1)  # 1秒钟检查一次爬虫状态
 
 
         self.delete_tables([self._redis_key + "*"])
         self.delete_tables([self._redis_key + "*"])
+
+
+class BusinessBaseListSpider(Spider):
+    """列表页爬虫事务基类"""
+
+    __business_type__ = "List"
+
+    def _increment_page_number(self, request):
+        """无限翻页 - 页码自增"""
+        if self.platform_next_page:
+            if getattr(request, 'real_page', None) is None:
+                request.real_page = 0  # real_page=连续翻页页码(真实入库数量=0)
+
+            request.real_page += 1
+
+            if request.rel_count > 0:
+                request.real_page = 0  # 当真实入库数量大于0,重置翻页记录
+                request.rel_count = 0  # 重置实际入库数量
+
+            if request.real_page <= 5 and request.page < self.platform_max_page:
+                request.page += 1
+                # 设置无限翻页回调方法,进行列表页解析处理
+                callback_parser = (
+                    request.callback
+                    if callable(request.callback)
+                    else self.parse
+                )
+                request.callback = callback_parser
+                yield request
+        else:
+            if request.page < int(request.item["crawl_page"]):
+                request.page += 1  # 采集页码自增
+                request.rel_count = 0  # 重置实际入库数量
+                # 设置无限翻页回调方法,进行列表页解析处理
+                callback_parser = (
+                    request.callback
+                    if callable(request.callback)
+                    else self.parse
+                )
+                request.callback = callback_parser
+                yield request
+
+    def infinite_pages(self, request, response):
+        """无限翻页"""
+        request_generator = self._increment_page_number(request)
+        try:
+            request = next(request_generator)
+            return request
+        except StopIteration:
+            pass
+
+
+class BusinessBaseDetailSpider(Spider):
+    """详情页爬虫事务基类"""
+
+    __business_type__ = "Detail"
+
+    __custom_setting__ = dict(
+        ITEM_FILTER_ENABLE=False
+    )
+
+    err_coll_name = "listdata_err"
+    _to_db = None
+
+    def get_tasks(self, query, limit=10, is_delete=True, **kwargs):
+        """
+        领取采集任务
+
+        :param dict query: 查询条件
+        :param limit: 结果数量
+        :param is_delete: 取走的任务是否删除
+        :param kwargs
+            更多参数 https://docs.mongodb.com/manual/reference/command/find/#command-fields
+
+        :return: dict
+        """
+        if "sort" not in kwargs:
+            kwargs.setdefault("sort", {"_id": -1})
+
+        cursor = self.to_db.find(coll_name=self.db_name, condition=query, limit=limit, **kwargs)
+        for task in cursor:
+            yield task
+
+            if is_delete:
+                self.to_db.delete(coll_name=self.db_name, condition=task)
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+
+class BiddingListSpider(BusinessBaseListSpider):
+    """标讯列表页爬虫事务类"""
+
+    __business_type__ = "BiddingList"
+
+    pass
+
+
+class BiddingDetailSpider(BusinessBaseDetailSpider):
+    """标讯详情页爬虫事务类"""
+
+    __business_type__ = "BiddingDetail"
+    db_name = "mgp_list"
+
+    def failed_request(self, request, response):
+        """请求、解析错误次数超过上限后,将原信息重新保存至数据库,并修改failed字段"""
+        _data = request.base_info if isinstance(request.base_info, dict) else request.base_info.to_dict
+        item = Item(origin_data=_data)
+        item.table_name = self.err_coll_name
+        item.status_code = getattr(response, "status_code", -1)
+        item.err_reason = getattr(request, "error_msg", "")
+        item.err_requests = int(getattr(item, "err_requests", 0)) + 1
+        item.create_at = tools.ensure_int64(tools.get_current_timestamp())
+        item.origin = self.db_name
+        item.spidercode = _data["spidercode"]
+        yield item
+
+
+class PlanToBuildListSpider(BusinessBaseListSpider):
+    """拟建列表页爬虫事务类"""
+
+    __business_type__ = "PlanToBuildList"
+
+    pass
+
+
+class PlanToBuildDetailSpider(BusinessBaseDetailSpider):
+    """拟建详情页爬虫事务类"""
+
+    __business_type__ = "PlanToBuildDetail"
+    db_name = "njpc_list"
+
+    def failed_request(self, request, response):
+        """请求、解析错误次数超过上限后,将原信息重新保存至数据库,并修改failed字段"""
+        _data = request.item if isinstance(request.item, dict) else request.item.to_dict
+        item = Item(origin_data=_data)
+        item.table_name = self.err_coll_name
+        item.status_code = getattr(response, "status_code", -1)
+        item.err_requests = int(getattr(item, "err_requests", 0)) + 1
+        item.err_reason = getattr(request, "error_msg", "")
+        item.create_at = tools.ensure_int64(tools.get_current_timestamp())
+        item.origin = self.db_name
+        item.spidercode = _data["spidercode"]
+        yield item
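
A minimal, hedged usage sketch for the business base classes above: a list spider that defers paging to infinite_pages, and a detail spider that drains its task queue from MongoDB via get_tasks. The URL, the crawl_page value and the task fields are placeholders, and platform_next_page is assumed to be off so the crawl_page branch applies.

    import feapder
    from feapder.core.spiders import BiddingListSpider, BiddingDetailSpider


    class ExampleBiddingList(BiddingListSpider):
        def start_requests(self):
            # page / item / rel_count are the attributes the paging helper reads
            yield feapder.Request("https://example.com/list?page=1",
                                  page=1, item={"crawl_page": 3})

        def parse(self, request, response):
            for row in response.xpath('//li[@class="item"]'):
                ...  # build and yield list items; rel_count is expected to grow on real inserts
            next_request = self.infinite_pages(request, response)
            if next_request:  # None once crawl_page (or the platform cap) is reached
                yield next_request


    class ExampleBiddingDetail(BiddingDetailSpider):
        def start_requests(self):
            # take pending records from the mgp_list collection (deleted once claimed)
            for task in self.get_tasks({"parser_name": "example"}, limit=20):
                yield feapder.Request(task["parse_url"], base_info=task)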

+ 1 - 5
FworkSpider/feapder/db/__init__.py

@@ -159,7 +159,6 @@ class MongoDB:
         try:
         try:
             collection.insert_one(data)
             collection.insert_one(data)
         except DuplicateKeyError as e:
         except DuplicateKeyError as e:
-            data.pop("_id", "")
             # 存在则更新
             # 存在则更新
             if update_columns:
             if update_columns:
                 if not isinstance(update_columns, (tuple, list)):
                 if not isinstance(update_columns, (tuple, list)):
@@ -236,7 +235,6 @@ class MongoDB:
                     # 数据重复
                     # 数据重复
                     # 获取重复的数据
                     # 获取重复的数据
                     data = error.get("op")
                     data = error.get("op")
-                    data.pop("_id", "")
 
 
                     def get_condition():
                     def get_condition():
                         # 获取更新条件
                         # 获取更新条件
@@ -265,9 +263,7 @@ class MongoDB:
                             }
                             }
                         else:
                         else:
                             # 使用数据本身的值更新
                             # 使用数据本身的值更新
-                            doc = {}
-                            for key in update_columns:
-                                doc = {key: data.get(key)}
+                            doc = {key: data.get(key) for key in update_columns}
 
 
                         collection.update_one(get_condition(), {"$set": doc})
                         collection.update_one(get_condition(), {"$set": doc})
                         add_count -= 1
                         add_count -= 1
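
The change above fixes a real bug: the old loop rebuilt doc from scratch on every iteration, so only the last column in update_columns survived, while the dict comprehension keeps them all. A small before/after sketch:

    data = {"title": "t", "href": "h", "publishtime": "2023-01-01"}
    update_columns = ["title", "href"]

    # old behaviour: doc is overwritten each pass and ends up as {"href": "h"}
    doc = {}
    for key in update_columns:
        doc = {key: data.get(key)}

    # new behaviour: every requested column is kept
    doc = {key: data.get(key) for key in update_columns}   # {"title": "t", "href": "h"}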

+ 107 - 76
FworkSpider/feapder/db/mysqldb.py

@@ -2,48 +2,89 @@
 """
 """
 Created on 2018-12-13 21:08
 Created on 2018-12-13 21:08
 ---------
 ---------
-@summary:  sha256 redis集群去重,正式环境使用的去重方式
+@summary:
 ---------
 ---------
 @author: Boris
 @author: Boris
 @email: boris_liu@foxmail.com
 @email: boris_liu@foxmail.com
 """
 """
 
 
 import copy
 import copy
-from typing import Any, List, Union, Tuple, Callable
-import rediscluster
-from Crypto.Hash import SHA256
-from feapder import setting
+from typing import Any, List, Union, Tuple, Callable, Optional
+
+from feapder.utils.tools import get_md5
+from .bloomfilter import BloomFilter, ScalableBloomFilter
+from .expirefilter import ExpireFilter
+from .litefilter import LiteFilter
+from .swordfishfilter import SwordFishFilter
+
 
 
 class Dedup:
 class Dedup:
     BloomFilter = 1
     BloomFilter = 1
     MemoryFilter = 2
     MemoryFilter = 2
     ExpireFilter = 3
     ExpireFilter = 3
-    def __init__(self,ilter_type: int = BloomFilter):
-        self._to_sha256 = True
-        self._to_redis = None
+    LiteFilter = 4
+    SwordFishFilter = 5
+
+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
+        if filter_type == Dedup.ExpireFilter:
+            try:
+                expire_time = kwargs["expire_time"]
+            except:
+                raise ValueError("需传参数 expire_time")
+
+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
+                "name", expire_time
+            )
+            expire_time_record_key = "dedup:expire_set:expire_time"
+
+            self.dedup = ExpireFilter(
+                name=name,
+                expire_time=expire_time,
+                expire_time_record_key=expire_time_record_key,
+                redis_url=kwargs.get("redis_url"),
+            )
+        elif filter_type == Dedup.SwordFishFilter:
+            self.dedup = SwordFishFilter(
+                redis_url=kwargs.get("redis_url"),
+                expire_time=kwargs.get("expire_time")
+            )
+        else:
+            initial_capacity = kwargs.get("initial_capacity", 100000000)
+            error_rate = kwargs.get("error_rate", 0.00001)
+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get("name", "bloomfilter")
+            if filter_type == Dedup.BloomFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
+                    redis_url=kwargs.get("redis_url"),
+                )
+            elif filter_type == Dedup.MemoryFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
+                )
+            elif filter_type == Dedup.LiteFilter:
+                self.dedup = LiteFilter()
+            else:
+                raise ValueError(
+                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
+                )
 
 
-    @property
-    def redis_cluster(self): # 连接redis集群
-        if not self._to_redis:
-            startup_nodes = [{"host": i.get("host"), "port": i.get("port")} for i in setting.REDISCLUSTER]
-            self._to_redis =  rediscluster.RedisCluster(startup_nodes=startup_nodes, decode_responses=True)
-        return self._to_redis
+        self._to_md5 = to_md5
 
 
     def __repr__(self):
     def __repr__(self):
-        return 'sha256'
-    def sha256(self,info):
-        if info is None:
-            return ''
-        res = SHA256.new(info.encode('utf-8'))
-        data = res.hexdigest()
-        return data
-
-    def _deal_datas(self, datas): # 对datas进行加密处理
-        if self._to_sha256:
+        return str(self.dedup)
+
+    def _deal_datas(self, datas):
+        if self._to_md5:
             if isinstance(datas, list):
             if isinstance(datas, list):
-                keys = [self.sha256(data) for data in datas]
+                keys = [get_md5(data) for data in datas]
             else:
             else:
-                keys = self.sha256(datas)
+                keys = get_md5(datas)
         else:
         else:
             keys = copy.deepcopy(datas)
             keys = copy.deepcopy(datas)
 
 
@@ -58,35 +99,11 @@ class Dedup:
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
         """
         """
+
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_added = self.insert_key(keys, skip_check)
+        is_added = self.dedup.add(keys, skip_check)
 
 
         return is_added
         return is_added
-    def insert_key(self,keys,skip_check):
-        if isinstance(keys, list):
-            for key in keys:
-                if not self.redis_cluster.exists("pylist_"+key):
-                    self.redis_cluster.set("pylist_"+key, 1,ex=86400*365*2)
-        else:
-            if not self.redis_cluster.exists("pylist_"+keys):
-                self.redis_cluster.set("pylist_"+keys,1,ex=86400*365*2)
-
-    def exists(self,keys):
-        exists = []
-        if isinstance(keys, list):
-            for key in keys:
-                exists.append(self.exit_key(key))
-        else:
-            exists.append(self.exit_key(keys))
-        return exists
-    def exit_key(self,key):
-        if self.redis_cluster.exists(key):
-            return True
-        if self.redis_cluster.exists("pylist_"+key):
-            return True
-        return False
-
-
 
 
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
     def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
         """
         """
@@ -95,44 +112,58 @@ class Dedup:
         @return: list / 单个值 (存在返回1 不存在返回0)
         @return: list / 单个值 (存在返回1 不存在返回0)
         """
         """
         keys = self._deal_datas(datas)
         keys = self._deal_datas(datas)
-        is_exists = self.exists(keys)
+        is_exists = self.dedup.get(keys)
 
 
         return is_exists
         return is_exists
 
 
-
     def filter_exist_data(
     def filter_exist_data(
         self,
         self,
         datas: List[Any],
         datas: List[Any],
         *,
         *,
+        datas_fingerprints: Optional[List] = None,
         callback: Callable[[Any], None] = None
         callback: Callable[[Any], None] = None
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
     ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
         """
         """
         过滤掉已存在的数据
         过滤掉已存在的数据
+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
+        @param datas_fingerprints: 数据的唯一指纹 列表
         @param datas: 数据 列表
         @param datas: 数据 列表
         @param callback: 数据已存在时的回调 callback(data)
         @param callback: 数据已存在时的回调 callback(data)
         @return: None
         @return: None
-        [0,1,1]
-        [b,c,d]
-        []
         """
         """
-        is_exists = self.get(datas)
+
+        is_exists = self.get(datas_fingerprints or datas)
+
         dedup_datas = []
         dedup_datas = []
-        while is_exists:
-            data = datas.pop(0)
-            is_exist = is_exists.pop(0)
 
 
-            if not is_exist:
-                dedup_datas.append(data)
-            else:
-                if callback:
-                    callback(data)
-
-        datas.extend(dedup_datas)
-        return datas
-
-if __name__ == '__main__':
-    dedup = Dedup(Dedup.BloomFilter)
-    href = 'http://www.ccgp-tianjin.gov.cn/viewer.do?id=339715380&ver=2222'
-    ss = dedup.filter_exist_data([href])
-    # res = dedup.add([href,'llk'])
-    print(ss)
+        if datas_fingerprints:
+            dedup_datas_fingerprints = []
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+                data_fingerprint = datas_fingerprints.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                    dedup_datas_fingerprints.append(data_fingerprint)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas_fingerprints.extend(dedup_datas_fingerprints)
+            datas.extend(dedup_datas)
+            return datas, datas_fingerprints
+
+        else:
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas.extend(dedup_datas)
+            return datas
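
A short usage sketch of the rewritten Dedup facade above; the import path is assumed to remain feapder.dedup, the expire_time and redis_url values are placeholders, and the example return values follow the docstrings (1 = not seen before, 0 = already recorded):

    from feapder.dedup import Dedup

    dedup = Dedup(Dedup.ExpireFilter, expire_time=86400,
                  redis_url="redis://127.0.0.1:6379/0")

    urls = ["http://example.com/a", "http://example.com/b"]
    dedup.add(urls)                          # e.g. [1, 1] on first sight
    dedup.get("http://example.com/a")        # -> 1, already recorded

    # filter_exist_data prunes seen entries in place and returns the survivors
    fresh = ["http://example.com/a", "http://example.com/c"]
    dedup.filter_exist_data(fresh)           # fresh becomes ["http://example.com/c"]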

+ 14 - 20
FworkSpider/feapder/dedup/bitarray.py

@@ -14,7 +14,7 @@ import threading
 import time
 import time
 from struct import unpack, pack
 from struct import unpack, pack
 
 
-from feapder.db.redisdb import RedisDB
+from feapder.dedup.basefilter import BaseFilter
 from feapder.utils.redis_lock import RedisLock
 from feapder.utils.redis_lock import RedisLock
 from . import bitarray
 from . import bitarray
 
 
@@ -146,24 +146,18 @@ class BloomFilter(object):
         比较耗时 半小时检查一次
         比较耗时 半小时检查一次
         @return:
         @return:
         """
         """
-        # if self._is_at_capacity:
-        #     return self._is_at_capacity
-        #
-        # if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
-        #     bit_count = self.bitarray.count()
-        #     if bit_count and bit_count / self.num_bits > 0.5:
-        #         self._is_at_capacity = True
-        #
-        #     self._check_capacity_time = time.time()
-        #
-        # return self._is_at_capacity
-
         if self._is_at_capacity:
         if self._is_at_capacity:
             return self._is_at_capacity
             return self._is_at_capacity
 
 
-        bit_count = self.bitarray.count()
-        if bit_count and bit_count / self.num_bits > 0.5:
-            self._is_at_capacity = True
+        if (
+            not self._check_capacity_time
+            or time.time() - self._check_capacity_time > 1800
+        ):
+            bit_count = self.bitarray.count()
+            if bit_count and bit_count / self.num_bits > 0.5:
+                self._is_at_capacity = True
+
+            self._check_capacity_time = time.time()
 
 
         return self._is_at_capacity
         return self._is_at_capacity
 
 
@@ -174,8 +168,8 @@ class BloomFilter(object):
         @param keys: list or one key
         @param keys: list or one key
         @return:
         @return:
         """
         """
-        if self.is_at_capacity:
-            raise IndexError("BloomFilter is at capacity")
+        # if self.is_at_capacity:
+        #     raise IndexError("BloomFilter is at capacity")
 
 
         is_list = isinstance(keys, list)
         is_list = isinstance(keys, list)
 
 
@@ -197,7 +191,7 @@ class BloomFilter(object):
         return is_added if is_list else is_added[0]
         return is_added if is_list else is_added[0]
 
 
 
 
-class ScalableBloomFilter(object):
+class ScalableBloomFilter(BaseFilter):
     """
     """
     自动扩展空间的bloomfilter, 当一个filter满一半的时候,创建下一个
     自动扩展空间的bloomfilter, 当一个filter满一半的时候,创建下一个
     """
     """
@@ -273,7 +267,7 @@ class ScalableBloomFilter(object):
                     if self.name
                     if self.name
                     else "ScalableBloomFilter"
                     else "ScalableBloomFilter"
                 )
                 )
-                with RedisLock(key=key) as lock:
+                with RedisLock(key=key, redis_url=self.redis_url) as lock:
                     if lock.locked:
                     if lock.locked:
                         while True:
                         while True:
                             if self.filters[-1].is_at_capacity:
                             if self.filters[-1].is_at_capacity:
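
The capacity check above is throttled again: instead of counting set bits on every call (an expensive Redis round trip), it recounts at most once every 1800 s and latches the result in _is_at_capacity. The same pattern in isolation, as a hedged standalone sketch:

    import time

    class ThrottledFlag:
        """Re-evaluate an expensive boolean predicate at most once per interval."""

        def __init__(self, compute, interval=1800):
            self._compute = compute      # expensive callable returning bool
            self._interval = interval
            self._last_check = None
            self._flag = False

        @property
        def value(self):
            if self._flag:               # once true it stays true, like is_at_capacity
                return self._flag
            if not self._last_check or time.time() - self._last_check > self._interval:
                self._flag = self._compute()
                self._last_check = time.time()
            return self._flag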

+ 13 - 2
FworkSpider/feapder/dedup/expirefilter.py

@@ -11,9 +11,10 @@ Created on 2018/12/13 9:44 PM
 import time
 import time
 
 
 from feapder.db.redisdb import RedisDB
 from feapder.db.redisdb import RedisDB
+from feapder.dedup.basefilter import BaseFilter
 
 
 
 
-class ExpireFilter:
+class ExpireFilter(BaseFilter):
     redis_db = None
     redis_db = None
 
 
     def __init__(
     def __init__(
@@ -55,7 +56,17 @@ class ExpireFilter:
         return is_added
         return is_added
 
 
     def get(self, keys):
     def get(self, keys):
-        return self.redis_db.zexists(self.name, keys)
+        is_exist = self.redis_db.zexists(self.name, keys)
+        if isinstance(keys, list):
+            # 判断数据本身是否重复
+            temp_set = set()
+            for i, key in enumerate(keys):
+                if key in temp_set:
+                    is_exist[i] = 1
+                else:
+                    temp_set.add(key)
+
+        return is_exist
 
 
     def del_expire_key(self):
     def del_expire_key(self):
         self.redis_db.zremrangebyscore(
         self.redis_db.zremrangebyscore(
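
get above now also flags duplicates that occur twice within the same batch, not only keys already stored in Redis: after the zexists lookup, any key seen earlier in the list is forced to 1. The second pass in isolation:

    def mark_in_batch_duplicates(keys, is_exist):
        """is_exist: 0/1 results from the redis lookup, same order as keys."""
        seen = set()
        for i, key in enumerate(keys):
            if key in seen:
                is_exist[i] = 1      # a repeat inside the batch counts as existing
            else:
                seen.add(key)
        return is_exist

    print(mark_in_batch_duplicates(["a", "b", "a"], [0, 0, 0]))  # -> [0, 0, 1]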

+ 0 - 178
FworkSpider/feapder/dedup/old__init__.py

@@ -1,178 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-12-13 21:08
----------
-@summary: 布隆去重,测试框架使用的去重方式
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import copy
-from typing import Any, List, Union, Optional, Tuple, Callable
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-
-
-class Dedup:
-    BloomFilter = 1
-    MemoryFilter = 2
-    ExpireFilter = 3
-
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        """
-        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
-        Args:
-            filter_type: 过滤器类型 BloomFilter
-            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
-            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
-            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
-            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
-            to_md5: 去重前是否将数据转为MD5,默认是
-            redis_url: redis://[[username]:[password]]@localhost:6379/0
-                       BloomFilter 与 ExpireFilter 使用
-                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
-            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
-            error_rate:布隆过滤器的误判率 默认0.00001
-            **kwargs:
-        """
-
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
-                "name", "bloomfilter"
-            )
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
-                )
-
-        self._to_md5 = to_md5
-
-    def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
-            if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
-            else:
-                keys = get_md5(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def add(
-        self, datas: Union[List[Any], Any], skip_check: bool = False
-    ) -> Union[List[Any], Any]:
-        """
-        添加数据
-        @param datas: list / 单个值
-        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
-        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-
-        keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
-
-        return is_added
-
-    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
-        """
-        检查数据是否存在
-        @param datas: list / 单个值
-        @return: list / 单个值 (存在返回1 不存在返回0)
-        """
-        keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
-
-        return is_exists
-
-    def filter_exist_data(
-        self,
-        datas: List[Any],
-        *,
-        datas_fingerprints: Optional[List] = None,
-        callback: Callable[[Any], None] = None
-    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
-        """
-        过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
-        @param datas: 数据 列表
-        @param callback: 数据已存在时的回调 callback(data)
-        @return: None
-        """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
-        dedup_datas = []
-
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas

+ 40 - 73
FworkSpider/feapder/network/__init__.py

@@ -16,15 +16,18 @@ import warnings
 from collections import Iterable
 from collections import Iterable
 from enum import Enum, unique
 from enum import Enum, unique
 
 
+import requests
+from func_timeout import func_set_timeout
+
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder import setting
 from feapder import setting
-from feapder.db.mysqldb import MysqlDB
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
 from feapder.db.redisdb import RedisDB
+from feapder.network import user_agent
 from feapder.utils import metrics
 from feapder.utils import metrics
 from feapder.utils.log import log
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
 from feapder.utils.redis_lock import RedisLock
 from feapder.utils.tools import send_msg
 from feapder.utils.tools import send_msg
-from feapder.utils.webdriver import WebDriver
 
 
 
 
 class CookiePoolInterface(metaclass=abc.ABCMeta):
 class CookiePoolInterface(metaclass=abc.ABCMeta):
@@ -101,21 +104,14 @@ class PageCookiePool(CookiePoolInterface):
         可能会重写
         可能会重写
         @return:
         @return:
         """
         """
-        with WebDriver(**self._kwargs) as driver:
-            driver.get(self._page_url)
-            cookies = driver.get_cookies()
-            cookies_json = {}
-            for cookie in cookies:
-                cookies_json[cookie["name"]] = cookie["value"]
-
-            for key in self._must_contained_keys:
-                if key not in cookies_json:
-                    break
-            else:
-                return cookies_json
-
-            log.error("获取cookie失败 cookies = {}".format(cookies_json))
-            return None
+        url = self._page_url
+        header = {
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": user_agent.get()
+        }
+        res = requests.get(url, headers=header)
+        cookies = requests.utils.dict_from_cookiejar(res.cookies)
+        return cookies
 
 
     def add_cookies(self, cookies):
     def add_cookies(self, cookies):
         log.info("添加cookie {}".format(cookies))
         log.info("添加cookie {}".format(cookies))
@@ -126,7 +122,6 @@ class PageCookiePool(CookiePoolInterface):
             try:
             try:
                 now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                 now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                 need_cookie_count = self._min_cookies - now_cookie_count
                 need_cookie_count = self._min_cookies - now_cookie_count
-
                 if need_cookie_count > 0:
                 if need_cookie_count > 0:
                     log.info(
                     log.info(
                         "当前cookie数为 {} 小于 {}, 生产cookie".format(
                         "当前cookie数为 {} 小于 {}, 生产cookie".format(
@@ -141,7 +136,6 @@ class PageCookiePool(CookiePoolInterface):
                         log.exception(e)
                         log.exception(e)
                 else:
                 else:
                     log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
                     log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-
                     # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
                     # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
                     last_count_info = self._redisdb.strget(
                     last_count_info = self._redisdb.strget(
                         self._tab_cookie_pool_last_count
                         self._tab_cookie_pool_last_count
@@ -176,6 +170,7 @@ class PageCookiePool(CookiePoolInterface):
                 log.exception(e)
                 log.exception(e)
                 tools.delay_time(1)
                 tools.delay_time(1)
 
 
+    @func_set_timeout(120)
     def get_cookie(self, wait_when_null=True):
     def get_cookie(self, wait_when_null=True):
         while True:
         while True:
             try:
             try:
@@ -184,9 +179,10 @@ class PageCookiePool(CookiePoolInterface):
                     log.info("暂无cookie 生产中...")
                     log.info("暂无cookie 生产中...")
                     self._keep_alive = False
                     self._keep_alive = False
                     self._min_cookies = 1
                     self._min_cookies = 1
-                    with RedisLock(
-                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
-                    ) as _lock:
+                    _lock = RedisLock(key=self._tab_cookie_pool,
+                                      lock_timeout=3600,
+                                      wait_timeout=5)
+                    with _lock:
                         if _lock.locked:
                         if _lock.locked:
                             self.run()
                             self.run()
                     continue
                     continue
@@ -240,25 +236,10 @@ class LoginCookiePool(CookiePoolInterface):
         self._password_key = password_key
         self._password_key = password_key
 
 
         self._redisdb = RedisDB()
         self._redisdb = RedisDB()
-        self._mysqldb = ()
-
-        self.create_userbase()
-
-    def create_userbase(self):
-        sql = f"""
-            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
-              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
-              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
-              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
-              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态(0未登录 1已登录)',
-              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封(0 未封 1 被封)',
-              PRIMARY KEY (`id`),
-              UNIQUE KEY `username` (`username`) USING BTREE
-            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-        """
-        self._mysqldb.execute(sql)
+        self._mongo = MongoDB(db='user_login')
 
 
     def create_cookie(self, username, password):
     def create_cookie(self, username, password):
+
         """
         """
         创建cookie
         创建cookie
         @param username: 用户名
         @param username: 用户名
@@ -273,15 +254,7 @@ class LoginCookiePool(CookiePoolInterface):
         @return: yield username, password
         @return: yield username, password
         """
         """
 
 
-        sql = "select {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
-            username_key=self._username_key,
-            password_key=self._password_key,
-            table_userbase=self._table_userbase,
-            lock_state_key=self._lock_state_key,
-            login_state_key=self._login_state_key,
-        )
-
-        return self._mysqldb.find(sql)
+        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
 
 
     def handle_login_failed_user(self, username, password):
     def handle_login_failed_user(self, username, password):
         """
         """
@@ -305,16 +278,13 @@ class LoginCookiePool(CookiePoolInterface):
         user_cookie = {"username": username, "cookie": cookie}
         user_cookie = {"username": username, "cookie": cookie}
 
 
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
+        self._mongo.add(
+                coll_name=self._table_userbase,
+                data={self._login_state_key:1},
+                update_columns=self._username_key,
+                update_columns_value=username)
 
 
-        sql = "update {table_userbase} set {login_state_key} = 1 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            login_state_key=self._login_state_key,
-            username_key=self._username_key,
-            username=username,
-        )
-
-        self._mysqldb.update(sql)
-
+    @func_set_timeout(60)
     def get_cookie(self, wait_when_null=True) -> User:
     def get_cookie(self, wait_when_null=True) -> User:
         while True:
         while True:
             try:
             try:
@@ -342,24 +312,19 @@ class LoginCookiePool(CookiePoolInterface):
         user_info = {"username": user.username, "cookie": user.cookie}
         user_info = {"username": user.username, "cookie": user.cookie}
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
 
 
-        sql = "update {table_userbase} set {login_state_key} = 0 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            login_state_key=self._login_state_key,
-            username_key=self._username_key,
-            username=user.username,
-        )
-
-        self._mysqldb.update(sql)
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._login_state_key: 1},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
 
 
     def user_is_locked(self, user: User):
     def user_is_locked(self, user: User):
-        sql = "update {table_userbase} set {lock_state_key} = 1 where {username_key} = '{username}'".format(
-            table_userbase=self._table_userbase,
-            lock_state_key=self._lock_state_key,
-            username_key=self._username_key,
-            username=user.username,
-        )
 
 
-        self._mysqldb.update(sql)
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._lock_state_key: 1},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
 
 
     def run(self):
     def run(self):
         with RedisLock(
         with RedisLock(
@@ -373,7 +338,9 @@ class LoginCookiePool(CookiePoolInterface):
                 if not user_infos:
                 if not user_infos:
                     log.info("无可用用户")
                     log.info("无可用用户")
 
 
-                for username, password in user_infos:
+                for info in user_infos:
+                    username = info.get("username")
+                    password = info.get("password")
                     for i in range(self._login_retry_times):
                     for i in range(self._login_retry_times):
                         try:
                         try:
                             cookie = self.create_cookie(username, password)
                             cookie = self.create_cookie(username, password)
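
After the changes above, PageCookiePool.create_cookie fetches cookies with a plain requests GET instead of a WebDriver, and LoginCookiePool keeps account and login state in MongoDB (db user_login) instead of MySQL. A hedged usage sketch; the import path, redis_key, page URL and userbase collection name are assumptions, and constructor keywords follow the attribute names above:

    # import path assumed: this module is feapder's cookie pool
    from feapder.network.cookie_pool import PageCookiePool, LoginCookiePool

    # page-level cookies: one GET with a random User-Agent, stored in redis
    page_pool = PageCookiePool(redis_key="example", page_url="https://example.com/")
    cookies = page_pool.create_cookie()      # dict of cookie name -> value

    # login cookies: accounts come from the user_login MongoDB database
    class ExampleLoginPool(LoginCookiePool):
        def create_cookie(self, username, password):
            ...  # perform the site login here and return a cookie dict

    login_pool = ExampleLoginPool(redis_key="example", table_userbase="example_user")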

+ 0 - 20
FworkSpider/feapder/network/item.py

@@ -1,20 +0,0 @@
-117.88.5.96:8860
-111.179.93.27:8861
-111.179.93.27:8860
-113.226.100.155:8861
-113.226.100.155:8860
-114.99.103.81:8861
-171.13.51.41:8861
-114.99.103.81:8860
-171.13.51.41:8860
-125.41.17.67:8861
-125.41.17.67:8860
-113.123.0.127:8861
-117.88.5.96:8861
-182.101.196.230:8861
-113.123.0.127:8860
-182.101.196.230:8860
-182.34.102.234:8861
-182.34.102.234:8860
-117.88.4.100:8861
-117.88.4.100:8860

+ 0 - 20
FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt

@@ -1,20 +0,0 @@
-122.159.219.174:8860&&1653299700
-182.34.19.216:8860&&1653299010
-106.35.223.168:8861&&1653298655
-125.45.91.69:8861&&1653298844
-125.45.91.69:8860&&1653298844
-122.159.219.174:8861&&1653299700
-106.35.223.168:8860&&1653298655
-182.34.19.216:8861&&1653299010
-113.121.20.254:8861&&1653300488
-125.72.106.216:8861&&1653300251
-113.121.20.254:8860&&1653300488
-125.72.106.216:8860&&1653300251
-119.112.80.248:8861&&1653298967
-119.112.80.248:8860&&1653298967
-58.213.26.197:8860&&1653298952
-58.213.26.197:8861&&1653298952
-113.226.110.38:8861&&1653300048
-113.226.110.38:8860&&1653300048
-113.121.41.156:8860&&1653299102
-113.121.41.156:8861&&1653299102

+ 5 - 9
FworkSpider/feapder/network/proxy_pool.py

@@ -1,6 +1,6 @@
-# coding:utf8
+# -*- coding: utf-8 -*-
 """
 """
-代理池  弃用
+代理池
 """
 """
 import datetime
 import datetime
 import json
 import json
@@ -122,9 +122,9 @@ def get_proxy_from_http(proxy_source_url, **kwargs):
         response = requests.get(proxy_source_url, timeout=20)
         response = requests.get(proxy_source_url, timeout=20)
        # 改写:获取socks代理的response处理
        # 改写:获取socks代理的response处理
         for proxy in response.json():
         for proxy in response.json():
-            host = decrypt(proxy['host'])
-            port = proxy['port']
-            endTime = proxy['EndTime']
+            host = decrypt(proxy['ip'])
+            port = proxy['ports'][0]
+            endTime = proxy['lifetime']
             pool.append(f"{host}:{port}&&{endTime}")
             pool.append(f"{host}:{port}&&{endTime}")
 
 
         with open(os.path.join(proxy_path, filename), "w") as f:
         with open(os.path.join(proxy_path, filename), "w") as f:
@@ -757,7 +757,3 @@ class ProxyPool(ProxyPoolBase):
         :return:
         :return:
         """
         """
         return get_proxy_from_url(**self.kwargs)
         return get_proxy_from_url(**self.kwargs)
-# 
-# 
-# if __name__ == '__main__':
-#     ProxyPool().get()
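
get_proxy_from_http above is adapted to a provider payload whose entries carry ip / ports / lifetime instead of host / port / EndTime. A hedged sketch of the parsing step on a made-up payload (the project's real decrypt() is stood in for by base64 here):

    import base64

    # hypothetical payload in the new provider format assumed by the change above
    payload = [{"ip": "MTAuMC4wLjE=", "ports": [8860, 8861], "lifetime": 1653299700}]

    def decrypt(value):                  # placeholder for the project's decrypt()
        return base64.b64decode(value).decode()

    pool = []
    for proxy in payload:
        host = decrypt(proxy["ip"])
        port = proxy["ports"][0]
        end_time = proxy["lifetime"]
        pool.append(f"{host}:{port}&&{end_time}")

    print(pool)   # ['10.0.0.1:8860&&1653299700'], the same "host:port&&endTime" cache format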

+ 23 - 37
FworkSpider/feapder/network/request.py

@@ -7,9 +7,10 @@ Created on 2018-07-25 11:49:08
 @author: Boris
 @author: Boris
 @email:  boris_liu@foxmail.com
 @email:  boris_liu@foxmail.com
 """
 """
+import copy
+import re
 
 
 import requests
 import requests
-from func_timeout import func_set_timeout, FunctionTimedOut
 from requests.adapters import HTTPAdapter
 from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 from requests.cookies import RequestsCookieJar
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
@@ -18,11 +19,10 @@ import feapder.setting as setting
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
 from feapder.db.redisdb import RedisDB
 from feapder.db.redisdb import RedisDB
 from feapder.network import user_agent
 from feapder.network import user_agent
-from feapder.network.proxy_pool import ProxyPool
 from feapder.network.response import Response
 from feapder.network.response import Response
-from feapder.utils.log import Log
+from feapder.utils.log import log
 from feapder.utils.webdriver import WebDriverPool
 from feapder.utils.webdriver import WebDriverPool
-log = Log()
+
 # 屏蔽warning信息
 # 屏蔽warning信息
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
 
@@ -40,7 +40,7 @@ class Request(object):
     oss_handler = None
     oss_handler = None
 
 
     __REQUEST_ATTRS__ = {
     __REQUEST_ATTRS__ = {
-        # 'method', 'url', 必须传递 不加入**kwargs中
+        # "method", "url", 必须传递 不加入**kwargs中
         "params",
         "params",
         "data",
         "data",
         "headers",
         "headers",
@@ -92,6 +92,7 @@ class Request(object):
         render_time=0,
         render_time=0,
         splash=False,
         splash=False,
         iframes=0,
         iframes=0,
+        rel_count=0,
         **kwargs,
         **kwargs,
     ):
     ):
         """
         """
@@ -149,6 +150,7 @@ class Request(object):
         self.render = render
         self.render = render
         self.splash = splash
         self.splash = splash
         self.iframes = iframes
         self.iframes = iframes
+        self.rel_count = rel_count
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
         self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
 
 
         self.requests_kwargs = {}
         self.requests_kwargs = {}
@@ -200,7 +202,6 @@ class Request(object):
 
 
         return self.__class__.webdriver_pool
         return self.__class__.webdriver_pool
 
 
-
     @property
     @property
     def to_dict(self):
     def to_dict(self):
         request_dict = {}
         request_dict = {}
@@ -245,7 +246,6 @@ class Request(object):
             else self.callback
             else self.callback
         )
         )
 
 
-    @func_set_timeout(30)
     def get_response(self, save_cached=False):
     def get_response(self, save_cached=False):
         """
         """
         获取带有selector功能的response
         获取带有selector功能的response
@@ -258,7 +258,9 @@ class Request(object):
         )  # connect=22 read=22
         )  # connect=22 read=22
 
 
         # 设置stream
         # 设置stream
-        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
+        # 默认情况下,当你进行网络请求后,响应体会立即被下载。
+        # stream=True时,调用Response.content 才会下载响应体,默认只返回header。
+        # 缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
         self.requests_kwargs.setdefault("stream", True)
         self.requests_kwargs.setdefault("stream", True)
 
 
         # 关闭证书验证
         # 关闭证书验证
@@ -267,7 +269,7 @@ class Request(object):
         # 设置请求方法
         # 设置请求方法
         method = self.__dict__.get("method")
         method = self.__dict__.get("method")
         if not method:
         if not method:
-            if "data" in self.requests_kwargs:
+            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                 method = "POST"
                 method = "POST"
             else:
             else:
                 method = "GET"
                 method = "GET"
@@ -329,7 +331,6 @@ class Request(object):
             )
             )
         )
         )
 
 
-
         use_session = (
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
             setting.USE_SESSION if self.use_session is None else self.use_session
         )  # self.use_session 优先级高
         )  # self.use_session 优先级高
@@ -338,7 +339,7 @@ class Request(object):
             # 使用request的user_agent、cookies、proxy
             # 使用request的user_agent、cookies、proxy
             user_agent = headers.get("User-Agent") or headers.get("user-agent")
             user_agent = headers.get("User-Agent") or headers.get("user-agent")
             cookies = self.requests_kwargs.get("cookies")
             cookies = self.requests_kwargs.get("cookies")
-            print(cookies)
+            print(f'cookies >>>  {cookies}')
             if cookies and isinstance(cookies, RequestsCookieJar):
             if cookies and isinstance(cookies, RequestsCookieJar):
                 cookies = cookies.get_dict()
                 cookies = cookies.get_dict()
 
 
@@ -347,9 +348,7 @@ class Request(object):
                 if cookie_str:
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
                     cookies = tools.get_cookies_from_str(cookie_str)
 
 
-
             browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
             browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
-
             try:
             try:
                 if proxies:
                 if proxies:
                     self.chage_ip(browser)
                     self.chage_ip(browser)
@@ -375,24 +374,21 @@ class Request(object):
                         },
                         },
                     }
                     }
                 )
                 )
-
                 response.browser = browser
                 response.browser = browser
             except Exception as e:
             except Exception as e:
                 self._webdriver_pool.remove(browser)
                 self._webdriver_pool.remove(browser)
                 raise e
                 raise e
-
         elif use_session:
         elif use_session:
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
             response = Response(response)
         elif self.splash:
         elif self.splash:
-            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
+            resp = requests.get(setting.SWORDFISH_RENDER_URL, params={
                 'iframes': self.iframes,
                 'iframes': self.iframes,
                 'wait': self.render_time,
                 'wait': self.render_time,
                 'html': 1,
                 'html': 1,
-                'proxy': self.get_proxy().get("http"),
+                'proxy': {} if self.proxies == False else self.get_proxy().get("http"),
                 'url': self.url
                 'url': self.url
             })
             })
-
             response = Response(resp)
             response = Response(resp)
 
 
             # if self.iframes:
             # if self.iframes:
@@ -433,7 +429,6 @@ class Request(object):
 
 
         if save_cached:
         if save_cached:
             self.save_cached(response, expire_time=self.__class__.cached_expire_time)
             self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-        log.info("requests",extra={"url":response.url,"code":response.status_code})
         return response
         return response
 
 
     def proxies(self):
     def proxies(self):
@@ -450,19 +445,17 @@ class Request(object):
         """
         """
         proxies = self.proxies()
         proxies = self.proxies()
         if proxies:
         if proxies:
-            return proxies.get("http", "").strip("http://") or proxies.get(
-                "https", ""
-            ).strip("https://")
+            return re.sub(
+                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
+            )
 
 
     def get_proxy(self):
     def get_proxy(self):
-        headers = {
-            "Authorization": setting.JIANYU_PROXY_AUTHOR
-        }
-        proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
+        headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
+        proxy = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers).json()
         print(f"切换代理:{proxy.get('data')}")
         print(f"切换代理:{proxy.get('data')}")
         return proxy.get("data")
         return proxy.get("data")
 
 
-    def chage_ip(self,browser):
+    def chage_ip(self, browser):
         ip = self.get_proxy().get("http")  # ip格式"127.0.0.1:80"
         ip = self.get_proxy().get("http")  # ip格式"127.0.0.1:80"
         ip = ip.split("//")[-1]
         ip = ip.split("//")[-1]
         browser.get("about:config")
         browser.get("about:config")
@@ -472,8 +465,7 @@ class Request(object):
         prefs.setIntPref("network.proxy.type", 1);
         prefs.setIntPref("network.proxy.type", 1);
         prefs.setCharPref("network.proxy.socks", "%s");
         prefs.setCharPref("network.proxy.socks", "%s");
         prefs.setIntPref("network.proxy.socks_port", "%s");
         prefs.setIntPref("network.proxy.socks_port", "%s");
-        ''' % (
-        ip.split(':')[0], ip.split(':')[1])
+        ''' % (ip.split(':')[0], ip.split(':')[1])
         # 执行js
         # 执行js
         browser.execute_script(setupScript)
         browser.execute_script(setupScript)
 
 
@@ -542,13 +534,7 @@ class Request(object):
         response_dict = self._cache_db.strget(self._cached_redis_key)
         if not response_dict:
             log.info("无response缓存  重新下载")
-            try:
-                response_obj = self.get_response(save_cached=save_cached)
-            except FunctionTimedOut:
-                response_obj = None
-                log.info("请求超时")
-                log.info("requests", extra={"url": self.url, "code": 0})
-
+            response_obj = self.get_response(save_cached=save_cached)
         else:
             response_dict = eval(response_dict)
             response_obj = Response.from_dict(response_dict)
@@ -566,4 +552,4 @@ class Request(object):
         return cls(**request_dict)

     def copy(self):
-        return self.__class__.from_dict(self.to_dict)
+        return self.__class__.from_dict(copy.deepcopy(self.to_dict))
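
The switch from str.strip to re.sub in Request.proxy() above fixes a real pitfall: str.strip("http://") removes any of the characters h/t/p/:// from both ends of the string, not the literal prefix. A minimal sketch of the difference (host names made up for illustration):

    import re

    def strip_scheme(proxy_url):
        # Removes the scheme prefix as a whole, which is what Request.proxy() now does.
        return re.sub("http.*?//", "", proxy_url)

    print(strip_scheme("http://127.0.0.1:8888"))            # 127.0.0.1:8888
    print("http://tproxy.example.com:80".strip("http://"))  # roxy.example.com:80 -- leading "t"/"p" eaten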

+ 0 - 513
FworkSpider/feapder/network/request6.29.py

@@ -1,513 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:49:08
----------
-@summary: 请求结构体
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import requests
-from func_timeout import func_set_timeout, FunctionTimedOut
-from requests.adapters import HTTPAdapter
-from requests.cookies import RequestsCookieJar
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
-from feapder.network.proxy_pool import ProxyPool
-from feapder.network.response import Response
-from feapder.utils.log import Log
-from feapder.utils.webdriver import WebDriverPool
-log = Log()
-# 屏蔽warning信息
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
-
-class Request(object):
-    session = None
-    webdriver_pool: WebDriverPool = None
-    user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
-
-    cache_db = None  # redis / pika
-    cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
-    cached_expire_time = 1200  # 缓存过期时间
-
-    local_filepath = None
-    oss_handler = None
-
-    __REQUEST_ATTRS__ = {
-        # 'method', 'url', 必须传递 不加入**kwargs中
-        "params",
-        "data",
-        "headers",
-        "cookies",
-        "files",
-        "auth",
-        "timeout",
-        "allow_redirects",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "json",
-    }
-
-    DEFAULT_KEY_VALUE = dict(
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-    )
-
-    def __init__(
-        self,
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        **kwargs,
-    ):
-        """
-        @summary: Request参数
-        ---------
-        框架参数
-        @param url: 待抓取url
-        @param retry_times: 当前重试次数
-        @param priority: 优先级 越小越优先 默认300
-        @param parser_name: 回调函数所在的类名 默认为当前类
-        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
-        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
-        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
-        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
-        @param use_session: 是否使用session方式
-        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
-        @param download_midware: 下载中间件。默认为parser中的download_midware
-        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
-        @param render: 是否用浏览器渲染
-        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
-        --
-        以下参数与requests参数使用方式一致
-        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
-        @param params: 请求参数
-        @param data: 请求body
-        @param json: 请求json字符串,同 json.dumps(data)
-        @param headers:
-        @param cookies: 字典 或 CookieJar 对象
-        @param files:
-        @param auth:
-        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
-        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
-        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
-        @param verify: 为 True 时将会验证 SSL 证书
-        @param stream: 如果为 False,将会立即下载响应内容
-        @param cert:
-        --
-        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
-        ---------
-        @result:
-        """
-
-        self.url = url
-        self.retry_times = retry_times
-        self.priority = priority
-        self.parser_name = parser_name
-        self.callback = callback
-        self.filter_repeat = filter_repeat
-        self.auto_request = auto_request
-        self.request_sync = request_sync
-        self.use_session = use_session
-        self.random_user_agent = random_user_agent
-        self.download_midware = download_midware
-        self.is_abandoned = is_abandoned
-        self.render = render
-        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
-
-        self.requests_kwargs = {}
-        for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
-                self.requests_kwargs[key] = value
-
-            self.__dict__[key] = value
-
-    def __repr__(self):
-        try:
-            return "<Request {}>".format(self.url)
-        except:
-            return "<Request {}>".format(str(self.to_dict)[:40])
-
-    def __setattr__(self, key, value):
-        """
-        针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
-        @param key:
-        @param value:
-        @return:
-        """
-        self.__dict__[key] = value
-
-        if key in self.__class__.__REQUEST_ATTRS__:
-            self.requests_kwargs[key] = value
-
-    def __lt__(self, other):
-        return self.priority < other.priority
-
-    @property
-    def _session(self):
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-        if use_session and not self.__class__.session:
-            self.__class__.session = requests.Session()
-            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
-            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
-            self.__class__.session.mount("http", http_adapter)
-
-        return self.__class__.session
-
-    @property
-    def _webdriver_pool(self):
-        if not self.__class__.webdriver_pool:
-            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
-
-        return self.__class__.webdriver_pool
-
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool()
-
-        return self.__class__.proxies_pool
-
-    @property
-    def to_dict(self):
-        request_dict = {}
-
-        self.callback = (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-        self.download_midware = (
-            getattr(self.download_midware, "__name__")
-            if callable(self.download_midware)
-            else self.download_midware
-        )
-
-        for key, value in self.__dict__.items():
-            if (
-                key in self.__class__.DEFAULT_KEY_VALUE
-                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
-                or key == "requests_kwargs"
-            ):
-                continue
-
-            if key in self.__class__.__REQUEST_ATTRS__:
-                if not isinstance(
-                    value, (bytes, bool, float, int, str, tuple, list, dict)
-                ):
-                    value = tools.dumps_obj(value)
-            else:
-                if not isinstance(value, (bytes, bool, float, int, str)):
-                    value = tools.dumps_obj(value)
-
-            request_dict[key] = value
-
-        return request_dict
-
-    @property
-    def callback_name(self):
-        return (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-    @func_set_timeout(30)
-    def get_response(self, save_cached=False):
-        """
-        获取带有selector功能的response
-        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
-        @return:
-        """
-        # 设置超时默认时间
-        self.requests_kwargs.setdefault(
-            "timeout", setting.REQUEST_TIMEOUT
-        )  # connect=22 read=22
-
-        # 设置stream
-        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
-        self.requests_kwargs.setdefault("stream", True)
-
-        # 关闭证书验证
-        self.requests_kwargs.setdefault("verify", False)
-
-        # 设置请求方法
-        method = self.__dict__.get("method")
-        if not method:
-            if "data" in self.requests_kwargs:
-                method = "POST"
-            else:
-                method = "GET"
-
-        # 随机user—agent
-        headers = self.requests_kwargs.get("headers", {})
-        if "user-agent" not in headers and "User-Agent" not in headers:
-            if self.render:  # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
-                ua = setting.WEBDRIVER.get(
-                    "user_agent"
-                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-            else:
-                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-
-            if self.random_user_agent and setting.RANDOM_HEADERS:
-                headers.update({"User-Agent": ua})
-                self.requests_kwargs.update(headers=headers)
-        else:
-            self.requests_kwargs.setdefault(
-                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
-            )
-
-        # 代理
-        proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
-
-        log.debug(
-            """
-                -------------- %srequest for ----------------
-                url  = %s
-                method = %s
-                body = %s
-                """
-            % (
-                ""
-                if not self.parser_name
-                else "%s.%s "
-                % (
-                    self.parser_name,
-                    (
-                        self.callback
-                        and callable(self.callback)
-                        and getattr(self.callback, "__name__")
-                        or self.callback
-                    )
-                    or "parse",
-                ),
-                self.url,
-                method,
-                self.requests_kwargs,
-            )
-        )
-
-        # def hooks(response, *args, **kwargs):
-        #     print(response.url)
-        #
-        # self.requests_kwargs.update(hooks={'response': hooks})
-
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-
-        if self.render:
-            # 使用request的user_agent、cookies、proxy
-            user_agent = headers.get("User-Agent") or headers.get("user-agent")
-            cookies = self.requests_kwargs.get("cookies")
-            print(cookies)
-            if cookies and isinstance(cookies, RequestsCookieJar):
-                cookies = cookies.get_dict()
-
-            if not cookies:
-                cookie_str = headers.get("Cookie") or headers.get("cookie")
-                if cookie_str:
-                    cookies = tools.get_cookies_from_str(cookie_str)
-
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
-
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
-
-            try:
-                browser.get(self.url)
-                if cookies:
-                    browser.cookies = cookies
-                if self.render_time:
-                    tools.delay_time(self.render_time)
-
-                html = browser.page_source
-                response = Response.from_dict(
-                    {
-                        "url": browser.current_url,
-                        "cookies": browser.cookies,
-                        "_content": html.encode(),
-                        "status_code": 200,
-                        "elapsed": 666,
-                        "headers": {
-                            "User-Agent": browser.execute_script(
-                                "return navigator.userAgent"
-                            ),
-                            "Cookie": tools.cookies2str(browser.cookies),
-                        },
-                    }
-                )
-
-                response.browser = browser
-            except Exception as e:
-                self._webdriver_pool.remove(browser)
-                raise e
-
-        elif use_session:
-            response = self._session.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-        else:
-            response = requests.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-
-        if save_cached:
-            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-        log.info("requests",extra={"url":response.url,"code":response.status_code})
-        return response
-
-    def proxies(self):
-        """
-
-        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-
-        """
-        return self.requests_kwargs.get("proxies")
-
-    def proxy(self):
-        """
-
-        Returns: ip:port
-
-        """
-        proxies = self.proxies()
-        if proxies:
-            return proxies.get("http", "").strip("http://") or proxies.get(
-                "https", ""
-            ).strip("https://")
-
-    def user_agent(self):
-        headers = self.requests_kwargs.get("headers")
-        if headers:
-            return headers.get("user_agent") or headers.get("User-Agent")
-
-    @property
-    def fingerprint(self):
-        """
-        request唯一表识
-        @return:
-        """
-        url = self.__dict__.get("url", "")
-        # url 归一化
-        url = tools.canonicalize_url(url)
-        args = [url]
-
-        for arg in ["params", "data", "files", "auth", "cert", "json"]:
-            if self.requests_kwargs.get(arg):
-                args.append(self.requests_kwargs.get(arg))
-
-        return tools.get_md5(*args)
-
-    @property
-    def _cache_db(self):
-        if not self.__class__.cache_db:
-            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
-
-        return self.__class__.cache_db
-
-    @property
-    def _cached_redis_key(self):
-        if self.__class__.cached_redis_key:
-            return (
-                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
-            )
-        else:
-            return f"response_cached:test:{self.fingerprint}"
-
-    def save_cached(self, response, expire_time=1200):
-        """
-        使用redis保存response 用于调试 不用每回都下载
-        @param response:
-        @param expire_time: 过期时间
-        @return:
-        """
-
-        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
-
-    def get_response_from_cached(self, save_cached=True):
-        """
-        从缓存中获取response
-        注意:
-            属性值为空:
-                -raw : urllib3.response.HTTPResponse
-                -connection:requests.adapters.HTTPAdapter
-                -history
-
-            属性含义改变:
-                - request 由requests 改为Request
-        @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
-        @return:
-        """
-        response_dict = self._cache_db.strget(self._cached_redis_key)
-        if not response_dict:
-            log.info("无response缓存  重新下载")
-            try:
-                response_obj = self.get_response(save_cached=save_cached)
-            except FunctionTimedOut:
-                log.info("请求超时")
-                log.info("requests", extra={"url": self.url, "code": 0})
-
-        else:
-            response_dict = eval(response_dict)
-            response_obj = Response.from_dict(response_dict)
-        return response_obj
-
-    def del_response_cached(self):
-        self._cache_db.clear(self._cached_redis_key)
-
-    @classmethod
-    def from_dict(cls, request_dict):
-        for key, value in request_dict.items():
-            if isinstance(value, bytes):  # 反序列化 如item
-                request_dict[key] = tools.loads_obj(value)
-
-        return cls(**request_dict)
-
-    def copy(self):
-        return self.__class__.from_dict(self.to_dict)
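
The deleted module above still documents the response-cache workflow that survives in the new request.py (save_cached / get_response_from_cached / cached_redis_key). A minimal debugging sketch, assuming a reachable Redis configured via the project settings; the key name and URL are placeholders:

    from feapder import Request

    # Cache responses under response_cached:dev_debug:<fingerprint> for 20 minutes,
    # so repeated debug runs do not hit the site again.
    Request.cached_redis_key = "dev_debug"
    Request.cached_expire_time = 1200

    request = Request("https://example.com/list")
    response = request.get_response_from_cached(save_cached=True)
    print(response.status_code)

    request.del_response_cached()  # drop the cached copy when finished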

+ 2 - 1
FworkSpider/feapder/network/response.py

@@ -14,4 +14,5 @@ redis-py-cluster>=2.1.0
 cryptography>=3.3.2
 urllib3>=1.25.8
 loguru>=0.5.3
-influxdb>=5.3.1
+influxdb>=5.3.1
+func-timeout==4.3.5
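
The new func-timeout pin backs the @func_set_timeout(30) guard that the old request6.29.py shows on Request.get_response. A minimal sketch of the library's behaviour (the sleep stands in for a hanging download):

    import time
    from func_timeout import func_set_timeout, FunctionTimedOut

    @func_set_timeout(2)          # raises FunctionTimedOut once the call exceeds 2 seconds
    def slow_download():
        time.sleep(5)             # stand-in for a blocking network request

    try:
        slow_download()
    except FunctionTimedOut:
        print("download timed out")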

+ 6 - 4
FworkSpider/feapder/setting.py

@@ -4,9 +4,9 @@ import os

 # redis 表名
 # 任务表模版
-TAB_REQUSETS = "{redis_key}:z_requsets"
+TAB_REQUESTS = "{redis_key}:z_requests"
 # 任务失败模板
-TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets"
+TAB_FAILED_REQUESTS = "{redis_key}:z_failed_requests"
 # 数据保存失败模板
 TAB_FAILED_ITEMS = "{redis_key}:s_failed_items"
 # 爬虫状态表模版
@@ -78,6 +78,8 @@ WEBDRIVER = dict(

 # 爬虫启动时,重新抓取失败的requests
 RETRY_FAILED_REQUESTS = False
+# 爬虫启动时,重新入库失败的item
+RETRY_FAILED_ITEMS = False
 # 保存失败的request
 SAVE_FAILED_REQUEST = True
 # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
@@ -111,11 +113,11 @@ USE_SESSION = False

 # 去重
 ITEM_FILTER_ENABLE = False  # item 去重
 ITEM_FILTER_SETTING = dict(
-    filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+    filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
 )
 REQUEST_FILTER_ENABLE = False  # request 去重
 REQUEST_FILTER_SETTING = dict(
-    filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+    filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
     expire_time=2592000,  # 过期时间1个月
 )


+ 98 - 30
FworkSpider/feapder/templates/air_spider_template.tmpl

@@ -6,20 +6,22 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
+import re
 import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import time
 from urllib.parse import urljoin
-
 import feapder
 from feapder.utils.tools import wechat_warning
+from untils.attachment import AttachmentDownloader
 import execjs
 from items.spider_item import DataBakItem, MgpListItem
 from feapder.db.mongodb import MongoDB
+from feapder.utils.log import log



-class ${spider_name}(feapder.Spider):
+class Details(feapder.Spider):
     _to_db = None
     db_name = 'mgp_list'
     send_list = []
@@ -32,51 +34,122 @@ class ${spider_name}(feapder.Spider):

     def start_requests(self):
         while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},sort={"failed":-1},limit=50)
+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},limit=50)
             for item in data_lsit:
+                log.debug(item.get("item"))
                 request_params = item.get("request_params")
-
-                '''可自定义'''
-
-                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
-                                      deal_detail=item.get("deal_detail"),**request_params,
-                                      callback=eval(item.get("parse")),base_info=item,proxies=item.get("proxies"))
-                self.to_db.delete(self.db_name,item)
+                is_join_html = item.get("is_join_html")          # 正文是否根据xpath拼接
+                extra_html = item.get("extra_html")              # 过滤无效内容
+                if item.get("proxies"):
+                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
+                                          callback=eval(item.get("parse")),base_info=item,**request_params)
+                else:
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
+                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
+                self.to_db.delete(self.db_name, {"_id": item.get("_id")})
             break

     def detail_get(self,request,response):
-        '''需自定义解析规则'''
+
         items = request.item
         list_item = DataBakItem()
         for key in items:
             list_item.__setitem__(key,items[key])
+
         html = ''
-        # for xpath in request.deal_detail:
-        #    html = response.xpath(xpath).extract_first()
-        #    if html is not None:
-        #        break
+        for xpath in request.deal_detail:
+            html = response.xpath(xpath).extract_first()  # 标书详细内容
+            if request.is_join_html:
+                if html is not None:
+                    html += html
+            else:
+                if html is not None:
+                    break
+
+        extra_html_info = request.extra_html
+        if html and extra_html_info:
+            for extra_item in extra_html_info:
+                if re.search('^//.*', extra_item):
+                    extra_html = response.xpath(extra_item).extract_first()
+                else:
+                    extra_html = extra_item
+                html = html.replace(extra_html,'')

         list_item.contenthtml = html
-        # if request.files:
-        #     files_info = request.files
-        #     files =  response.xpath(files_info.get("xpath")).extract()
-        #     for file_url in files:
-        #         if files_info.get("host"):
-        #             file_url = urljoin(files_info.get("host"), file_url)
-        #         if file_url.split(".")[-1] in files.get("other_files"):
-        #             continue
+
+        if request.files_info:      # 附件下载
+            files_info = request.files_info
+            files = response.xpath(files_info.get("list_xpath"))
+            if len(files)>0:
+                attachments = {}
+                for info in files:
+                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
+                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
+                    if not file_name:
+                        file_name = info.xpath(files_info.get("name_xpath")).extract()
+                    if file_name:
+                        file_name = "".join("".join(file_name).split()).strip()
+                        if files_info.get("host"):
+                            file_url = urljoin(files_info.get("host"), file_url)
+                        if not files_info.get("file_type"):
+                            file_type = file_url.split("?")[0].split(".")[-1].lower()
+                            if file_type not in files_info.get("files_type"):
+                                file_type = file_name.split("?")[0].split(".")[-1].lower()
+
+                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
+                            attachment = AttachmentDownloader().fetch_attachment(
+                                file_name=file_name,file_type=file_type,download_url=file_url,
+                                enable_proxy=False)
+                            attachments[str(len(attachments)+1)] = attachment
+                if len(attachments)==0:
+                    pass
+                else:
+                    list_item.projectinfo={"attachments":attachments}
+
         yield list_item


+    def detail_json(self,request,response):
+
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+
+        exec(request.deal_detail)
+
+        yield list_item
+

     def failed_request(self, request, response):
         '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
+        if 200<=code<300:
+            err = 'analysis'
+        elif 300<=code<400:
+            err = 'download'
+        elif 400<=code<500:
+            err = 'download'
+        elif 500<=code:
+            err = "servers"
+        else:
+            err = "timeout"
         mgp = MgpListItem()
+        mgp.code=code
+        mgp.error=err
         items = request.base_info
         for key in items:
             mgp.__setitem__(key,items[key])
         mgp.failed +=1
-        print(f'......{mgp.failed}')
+        if mgp.pri is None:
+            mgp.pri = 0
+
         if mgp.pri > 5:
             if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
                 if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
@@ -96,10 +169,5 @@ class ${spider_name}(feapder.Spider):
         yield mgp


-    def end_callback(self):
-        print("爬虫结束")
-
-
-
 if __name__ == "__main__":
-    Details(redis_key="fwork:details1").start()
+    Details(redis_key="{USER}:${spider_name}").start()
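
For reference, start_requests above drives the spider from documents in the mgp_list collection. A sketch of such a task document, with field names taken from the template and purely illustrative values:

    task = {
        "parser_name": "${spider_name}",             # routes the task to this parser
        "parse": "self.detail_get",                  # callback resolved via eval()
        "parse_url": "https://example.com/notice/1.html",
        "item": {"title": "...", "href": "https://example.com/notice/1.html"},
        "deal_detail": ['//div[@class="content"]'],  # xpaths tried for the announcement body
        "is_join_html": False,                       # True: concatenate all matched xpaths
        "extra_html": [],                            # xpaths / literal fragments stripped from the body
        "files": None,                               # attachment rule dict, or None to skip downloads
        "proxies": False,
        "request_params": {},
        "failed": 0,
    }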

+ 61 - 45
FworkSpider/feapder/templates/project_template/CHECK_DATA.md

@@ -9,80 +9,96 @@ Created on {DATE}
 import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
-from items.spider_item import DataBakItem,MgpListItem,ListItem
+from items.spider_item import DataBakItem,MgpListItem
 from feapder.dedup import Dedup
 from collections import namedtuple


+
 class ${spider_name}(feapder.Spider):

     def start_callback(self):
+
+         self.site = ""
+
+         #               --- --- crawl_page 必须存在,且为纯数字(int) --- ---
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])

          self.menus = [
              Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "Notice", 1),
+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
          ]
+
+         self.headers = {}
+
     def start_requests(self):
          for menu in self.menus:
-             for page in range(1,menu.crawl_page+1):
-                 start_url = f''
-                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
+             start_url = ''
+             yield feapder.Request(url=start_url,item=menu._asdict(),page=1,real_page=0,proxies=False)
+

     def parse(self, request, response):
+        real_count = 0
         menu = request.item
         dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = []
+        info_list = response.xpath('')       # 数据结构为html
         for info in info_list:
-            href = ''
-            title = ''
-            create_time = ''
+            href = info.xpath('').extract_first().strip()
+            title = info.xpath('').extract_first().strip()
+            publish_time = info.xpath('').extract_first().strip()

-            data_item = DataBakItem()  # 存储数据的管道
-            data_item.href = href  # 标书链接
+            area = ""   # 省份
+            city = ""   # 城市
+
+            data_item = DataBakItem()                # 存储数据的管道
+            data_item.href = href                    # 标书链接
             data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
             data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
-            data_item.publishtime = create_time  # 标书发布时间
-            data_item.site = "*******记得编辑平台名称"
-            data_item.area = "全国"  # 城市默认:全国
-            data_item.city = ""  # 城市 默认为空
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
+            data_item.title = title                  # 标题
+            data_item.publishtime = publish_time     # 标书发布时间
+            data_item.site = self.site
+            data_item.area = area or "全国"           # 省份 默认:全国
+            data_item.city = city                    # 城市 默认 为空
+
+            undedup_data = dedup.filter_exist_data([href])    # 去重
+            if undedup_data == []:
                 continue
+
             list_item =  MgpListItem()
-            list_item.parse = "self.detail_get"
-            list_item.parser_name = "details"
+            list_item.parse = "self.detail_get"      # 详情页回调方法
+            list_item.parser_name = "details"        # 详情页标识 默认通用详情页
             list_item.item = data_item.to_dict
-            list_item.deal_detail = ['//div[@class="****"]',"*****"]
+            list_item.deal_detail = ['//div[@class="****"]']   # 抽取正文xpath
             list_item.proxies = False
-            list_item.parse_url = href
-            list_item.pri = 1
-            list.files={
-                "list_xpath":'//div[@class="notice-foot"]/a',
+            list_item.parse_url = href               # 详情页请求地址
+            list_item.pri = 1                        # 执行等级
+
+            list_item.files={                        # 附件采集规则
+                "list_xpath":'//div[@class="***"]//a[@href]',
                 "url_xpath":'./@href',
                 "name_xpath":'./text()',
-                "files_type":('zip','doxc','ftp'),
-                "file_type":'zip',
-                "url_key":'attachmentDownload',
-                # "host":'http',
-                "kwargs":{"headers": {
-                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
-                }}
-            href_list.append(href)
+                "files_type":('zip','docx','ftp','pdf','doc','rar','gzzb',
+                              'jpg','png','zbid','xls','xlsx','swp','dwg'), # 需要下载的附件类型
+                #"file_type":'pdf',                  # 默认的附件类型,用于url中未带附件类型的
+                "url_key":'http',                    # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
+                "host":'',                           # 需要拼接url的host
+            }
+
+            dedup.add(href)
             yield list_item
-        list = ListItem()
-        list.site = self.site
-        list.channel = menu.get("channel")
-        list.spidercode = menu.get("code")
-        list.url = request.url
-        list.count = len(info_list)
-        list.rel_count = len(href_list)
-        dedup.add(href_list)
-
-    def end_callback(self):
-        print("爬虫结束")
+            real_count += 1
+
+
+
+        # 无限翻页
+
+        request = self.infinite_pages(request,response)
+        yield request
+
+    def download_midware(self, request):
+        page = request.page
+        request.headers = self.headers
+

 if __name__ == "__main__":
     ${spider_name}(redis_key="{USER}:${spider_name}").start()
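
The parse method above relies on feapder's Dedup to skip links that were already pushed. A condensed sketch of that flow, with the URLs invented for illustration (the filter_type choices are the ones listed in setting.py: BloomFilter=1, MemoryFilter=2, ExpireFilter=3, LiteFilter=4, SwordFishFilter=5):

    from feapder.dedup import Dedup

    dedup = Dedup(Dedup.BloomFilter)
    hrefs = ["https://example.com/a", "https://example.com/b"]

    for href in dedup.filter_exist_data(hrefs):   # keeps only links not seen before
        # ...build and yield the DataBakItem / MgpListItem here...
        dedup.add(href)                           # mark as pushed only after the item is emitted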

+ 0 - 177
FworkSpider/feapder/utils/__init__.py

@@ -1,177 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/3/18 12:39 上午
----------
-@summary:  阿里云附件上传
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import hashlib
-import os
-import traceback
-import oss2
-import requests
-from feapder import setting
-import time
-
-class UploadOSS:
-    """阿里云 oss"""
-
-    def __init__(self):
-        oss_conf = setting.oss_
-        self.file_path: str = ""
-        self.file_stream: bytes = b''
-        self.__acc_key_id = oss_conf['key_id']
-        self.__acc_key_secret = oss_conf['key_secret']
-        self.__endpoint = oss_conf['endpoint']
-        self.__bucket_name = oss_conf['bucket_name']
-
-    @property
-    def fid(self):
-        """
-        文本摘要值
-
-        @return: 十六进制摘要值
-        """
-        sha1 = hashlib.sha1()
-        sha1.update(str(self.file_stream).encode("utf-8"))
-        return sha1.hexdigest()
-
-    @property
-    def file_size(self):
-        """
-        文件的大小,将字节(bytes)转化(kb/M/G单位)
-
-        @return: 文件大小
-        """
-        try:
-            size = os.path.getsize(self.file_path)
-        except Exception:
-            traceback.print_exc()
-        else:
-            try:
-                _kb = float(size) / 1024
-            except:
-                return "Error"
-            else:
-                if _kb >= 1024:
-                    _M = _kb / 1024
-                    if _M >= 1024:
-                        _G = _M / 1024
-                        return "{:.1f} G".format(_G)
-                    else:
-                        return "{:.1f} M".format(_M)
-                else:
-                    return "{:.1f} kb".format(_kb)
-
-    def get_state(self, attachment,count=0, **kwargs):
-        """
-        下载附件并上传阿里oss
-
-        @param attachment: 附件
-        @return: 附件处理结果
-        """
-        request_params = {
-            'headers': setting.headers,
-            'timeout': 20,
-            'stream': True,
-            **kwargs
-        }
-        with requests.get(attachment["org_url"], **request_params) as req:
-            if req.status_code == 200:
-                self.file_stream = req.content
-                # img_dir = "file"
-                img_dir = f"file/{attachment['channel']}"
-                # 文件夹不存在则创建文件夹
-                if not os.path.exists(img_dir):
-                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
-                # 打开目录,放入下载的附件
-                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
-                filname = filname.hexdigest() #加密1次
-                types = attachment["ftype"]
-                self.file_path = "{}/{}".format(img_dir, filname+'.'+types)
-                with open(self.file_path, 'wb') as f:
-                    f.write(self.file_stream)
-                # 上传附件
-                self.put_oss_from_local()
-                file_state = self.file_state(attachment)
-                # 删除附件
-                os.remove(self.file_path)
-                # 返回附件上传处理信息
-                return file_state
-            else:
-                if count<3:
-                    self.post_state(attachment,count=count+1, **kwargs)
-                else:
-                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-                    attachment["url"] = 'oss'
-                    attachment["fid"] = self.fid + "." + attachment["ftype"]
-                    attachment["size"] = '0kb'
-                    attachment["false"] = True
-                    return attachment
-    def post_state(self, attachment,count=0, **kwargs):
-        """
-        下载附件并上传阿里oss
-
-        @param attachment: 附件
-        @return: 附件处理结果
-        """
-        request_params = {
-            'headers': setting.headers,
-            'timeout': 20,
-            'stream': True,
-            **kwargs
-        }
-        with requests.post(attachment["org_url"], **request_params) as req:
-            if req.status_code == 200:
-                self.file_stream = req.content
-                img_dir = f"file/{attachment['channel']}"
-                # 文件夹不存在则创建文件夹
-                if not os.path.exists(img_dir):
-                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
-                # 打开目录,放入下载的附件
-                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
-                filname = filname.hexdigest()  # 加密1次
-                types = attachment["ftype"]
-                self.file_path = "{}/{}".format(img_dir, filname + '.' + types)
-
-                with open(self.file_path, 'wb') as f:
-                    f.write(self.file_stream)
-                # 上传附件
-                self.put_oss_from_local()
-                file_state = self.file_state(attachment)
-                # 删除附件
-                # os.remove(self.file_path)
-                # 返回附件上传处理信息
-                return file_state
-            else:
-                if count<3:
-                    self.post_state(attachment,count=count+1, **kwargs)
-                else:
-                    attachment["url"] = 'oss'
-                    attachment["fid"] = self.fid + "." + attachment["ftype"]
-                    attachment["size"] = '0kb'
-                    attachment["false"] = True
-                    return attachment
-
-    def put_oss_from_local(self):
-        """上传一个本地文件到阿里OSS的普通文件"""
-        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
-        bucket.put_object_from_file(self.fid, self.file_path)
-
-    def file_state(self, attachment):
-        """
-        文件信息
-
-        @param attachment: 附件
-        @return: 附件上传处理信息
-        """
-        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
-        attachment["url"] = 'oss'
-        attachment["fid"] = self.fid + "." + attachment["ftype"]
-        attachment["size"] = self.file_size
-        return attachment
-
-
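
The deleted UploadOSS helper boiled down to the oss2 call sequence below; its role is presumably taken over by untils/attachment.py. Shown only as a reference for the removed behaviour, with every credential a placeholder:

    import oss2

    auth = oss2.Auth("<access_key_id>", "<access_key_secret>")
    bucket = oss2.Bucket(auth, "<endpoint>", "<bucket_name>")
    # The object key was the sha1 of the file bytes plus the original extension.
    bucket.put_object_from_file("0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33.pdf", "file/demo.pdf")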

+ 1 - 1
FworkSpider/feapder/utils/custom_argparse.py

@@ -2,7 +2,7 @@
 """
 Created on 2020/2/19 12:57 PM
 ---------
-@summary: 邮件发送
+@summary:
 ---------
 @author: Boris
 @email: boris_liu@foxmail.com

+ 53 - 41
FworkSpider/feapder/utils/js/stealth.min.js

@@ -10,11 +10,10 @@ Created on 2018-12-08 16:50
 import logging
 import os
 import sys
-import time
 from logging.handlers import BaseRotatingHandler

+import logstash
 import loguru
-import pymongo
 from better_exceptions import format_exception

 import feapder.setting as setting
@@ -41,47 +40,46 @@ class RotatingFileHandler(BaseRotatingHandler):
         self.max_bytes = max_bytes
         self.backup_count = backup_count
         self.placeholder = str(len(str(backup_count)))
-        self._to_db = None
-        self.filename = filename
-
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
-
-        return self._to_db.pyspider

+    def doRollover(self):
+        if self.stream:
+            self.stream.close()
+            self.stream = None
+        if self.backup_count > 0:
+            for i in range(self.backup_count - 1, 0, -1):
+                sfn = ("%0" + self.placeholder + "d.") % i  # '%2d.'%i -> 02
+                sfn = sfn.join(self.baseFilename.split("."))
+                # sfn = "%d_%s" % (i, self.baseFilename)
+                # dfn = "%d_%s" % (i + 1, self.baseFilename)
+                dfn = ("%0" + self.placeholder + "d.") % (i + 1)
+                dfn = dfn.join(self.baseFilename.split("."))
+                if os.path.exists(sfn):
+                    # print "%s -> %s" % (sfn, dfn)
+                    if os.path.exists(dfn):
+                        os.remove(dfn)
+                    os.rename(sfn, dfn)
+            dfn = (("%0" + self.placeholder + "d.") % 1).join(
+                self.baseFilename.split(".")
+            )
+            if os.path.exists(dfn):
+                os.remove(dfn)
+            # Issue 18940: A file may not have been created if delay is True.
+            if os.path.exists(self.baseFilename):
+                os.rename(self.baseFilename, dfn)
+        if not self.delay:
+            self.stream = self._open()

     def shouldRollover(self, record):
-        parmars = {
-            "spider_name":record.name,
-            "msg":record.msg,
-            "Message":str(record.getMessage)
-        }
-        if record.levelname == "ERROR":
-            crawl_type = 'list'
-            if 'detail' in record.name:
-                crawl_type = 'detail'
-            url = ''
-            item={
-                "recordname":record.name,
-                "spidercode":"spidercode",
-                "author":self.filename,
-                "account":"",
-                "crawl_time":time.time(),
-                "crawl_type": crawl_type,
-                "status_code":"status_code",
-                "url":url,
-                "reason":record.msg,
-                'parmars': parmars,
-            }
-
-            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
-            # print(item)
-            # print(self.to_db.error_info)
-            # self.to_db.error_info.insert_one(item)

+        if self.stream is None:  # delay was set...
+            self.stream = self._open()
+        if self.max_bytes > 0:  # are we rolling over?
+            # print('record >>>> ', record)
+            msg = "%s\n" % self.format(record)
+            self.stream.seek(0, 2)  # due to non-posix-compliant Windows feature
+            if self.stream.tell() + len(msg) >= self.max_bytes:
+                return 1
+        return 0


 def get_logger(
@@ -90,6 +88,7 @@ def get_logger(
     log_level=None,
     is_write_to_console=None,
     is_write_to_file=None,
+    is_send_to_logstash = None,
     color=None,
     mode=None,
     max_bytes=None,
@@ -113,6 +112,7 @@ def get_logger(
     @result:
     """
     # 加载setting里最新的值
+    # name = os.path.split(os.getcwd())[-1]
     name = name or setting.LOG_NAME
     path = path or setting.LOG_PATH
     log_level = log_level or setting.LOG_LEVEL
@@ -126,6 +126,13 @@ def get_logger(
         if is_write_to_file is not None
         else setting.LOG_IS_WRITE_TO_FILE
     )
+
+    is_send_to_logstash = (
+        is_send_to_logstash
+        if is_send_to_logstash is not None
+        else setting.LOG_IS_SEND_TO_LOGSTASH
+    )
+
     color = color if color is not None else setting.LOG_COLOR
     mode = mode or setting.LOG_MODE
     max_bytes = max_bytes or setting.LOG_MAX_BYTES
@@ -144,8 +151,8 @@ def get_logger(

     # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
     if is_write_to_file:
-        # if path and not os.path.exists(os.path.dirname(path)):
-        #     os.makedirs(os.path.dirname(path))
+        if path and not os.path.exists(os.path.dirname(path)):
+            os.makedirs(os.path.dirname(path))

         rf_handler = RotatingFileHandler(
             path,
@@ -156,11 +163,16 @@ def get_logger(
         )
         rf_handler.setFormatter(formatter)
         logger.addHandler(rf_handler)
+
+    if is_send_to_logstash:
+        logger.addHandler(logstash.TCPLogstashHandler(setting.LOGSTASH_IP, setting.LOGSTASH_PORT, version=1))
+
     if color and is_write_to_console:
         loguru_handler = InterceptHandler()
         loguru_handler.setFormatter(formatter)
         # logging.basicConfig(handlers=[loguru_handler], level=0)
         logger.addHandler(loguru_handler)
+
     elif is_write_to_console:
         stream_handler = logging.StreamHandler()
         stream_handler.stream = sys.stdout
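
The new is_send_to_logstash branch wires the standard logging module to Logstash via python-logstash. A minimal sketch, with the address standing in for setting.LOGSTASH_IP / LOGSTASH_PORT:

    import logging
    import logstash

    logger = logging.getLogger("feapder-demo")
    logger.setLevel(logging.INFO)
    logger.addHandler(logstash.TCPLogstashHandler("127.0.0.1", 5959, version=1))
    logger.info("spider started", extra={"spidercode": "demo"})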

+ 14 - 8
FworkSpider/feapder/utils/metrics.py

@@ -17,13 +17,16 @@ from feapder.utils.log import log
 class RedisLock:
     redis_cli = None

-    def __init__(self, key, redis_cli=None, wait_timeout=0, lock_timeout=86400):
+    def __init__(
+        self, key, *, wait_timeout=0, lock_timeout=86400, redis_cli=None, redis_url=None
+    ):
         """
         redis超时锁
         :param key: 存储锁的key redis_lock:[key]
-        :param redis_cli: redis客户端对象
         :param wait_timeout: 等待加锁超时时间,为0时则不等待加锁,加锁失败
         :param lock_timeout: 锁超时时间 为0时则不会超时,直到锁释放或意外退出,默认超时为1天
+        :param redis_cli: redis客户端对象
+        :param redis_url: redis连接地址,若redis_cli传值,则不使用redis_url

         用法示例:
         with RedisLock(key="test") as _lock:
@@ -32,6 +35,7 @@ class RedisLock:
                 # do somethings
         """
         self.redis_conn = redis_cli
+        self.redis_url = redis_url
         self.lock_key = "redis_lock:{}".format(key)
         # 锁超时时间
         self.lock_timeout = lock_timeout
@@ -43,21 +47,23 @@ class RedisLock:
     @property
     def redis_conn(self):
         if not self.__class__.redis_cli:
-            self.__class__.redis_cli = RedisDB().get_redis_obj()
+            self.__class__.redis_cli = RedisDB(url=self.redis_url).get_redis_obj()

         return self.__class__.redis_cli

     @redis_conn.setter
     def redis_conn(self, cli):
-        self.__class__.redis_cli = cli
+        if cli:
+            self.__class__.redis_cli = cli

     def __enter__(self):
         if not self.locked:
             self.acquire()
-            # 延长锁的时间
-            thread = threading.Thread(target=self.prolong_life)
-            thread.setDaemon(True)
-            thread.start()
+            if self.locked:
+                # 延长锁的时间
+                thread = threading.Thread(target=self.prolong_life)
+                thread.setDaemon(True)
+                thread.start()
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
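
Usage of the reworked lock, following the docstring above (the diff places RedisLock in feapder.utils.metrics, so the import path assumes that layout); the key and redis_url are placeholders, and redis_url is only consulted when no redis_cli client is injected:

    from feapder.utils.metrics import RedisLock

    with RedisLock(key="site-a:crawl", wait_timeout=10, lock_timeout=600,
                   redis_url="redis://:password@127.0.0.1:6379/0") as _lock:
        if _lock.locked:
            pass  # only one worker at a time enters this block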

+ 37 - 32
FworkSpider/feapder/utils/tools.py

@@ -7,6 +7,7 @@ Created on 2018-09-06 14:21
 @author: Boris
 @email: boris_liu@foxmail.com
 """
+
 import asyncio
 import calendar
 import codecs
@@ -37,6 +38,7 @@ from pprint import pprint
 from urllib import request
 from urllib.parse import urljoin

+import bson
 import execjs  # pip install PyExecJS
 import redis
 import requests
@@ -45,8 +47,10 @@ from requests.cookies import RequestsCookieJar
 from w3lib.url import canonicalize_url as _canonicalize_url

 import feapder.setting as setting
+from feapder.db.redisdb import RedisDB
 from feapder.utils.email_sender import EmailSender
 from feapder.utils.log import log
+
 os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js

 # 全局取消ssl证书验证
@@ -61,18 +65,11 @@ redisdb = None
 def get_redisdb():
     global redisdb
     if not redisdb:
-        ip, port = setting.REDISDB_IP_PORTS.split(":")
-        redisdb = redis.Redis(
-            host=ip,
-            port=port,
-            db=setting.REDISDB_DB,
-            password=setting.REDISDB_USER_PASS,
-            decode_responses=True,
-        )  # redis默认端口是6379
+        redisdb = RedisDB()
     return redisdb


-# 装饰器 -- 单例模式
+# 装饰器
 class Singleton(object):
     def __init__(self, cls):
         self._cls = cls
@@ -598,20 +595,8 @@ def get_form_data(form):
     return data


-# mac上不好使
-# def get_domain(url):
-#     domain = ''
-#     try:
-#         domain = get_tld(url)
-#     except Exception as e:
-#         log.debug(e)
-#     return domain
-
-
 def get_domain(url):
-    proto, rest = urllib.parse.splittype(url)
-    domain, rest = urllib.parse.splithost(rest)
-    return domain
+    return urllib.parse.urlparse(url).netloc


 def get_index_url(url):
@@ -823,27 +808,31 @@ def jsonp2json(jsonp):
         raise ValueError("Invalid Input")


-def dumps_json(json_, indent=4, sort_keys=False):
+def dumps_json(data, indent=4, sort_keys=False):
     """
     @summary: 格式化json 用于打印
     ---------
-    @param json_: json格式的字符串或json对象
+    @param data: json格式的字符串或json对象
     ---------
     @result: 格式化后的字符串
     """
     try:
-        if isinstance(json_, str):
-            json_ = get_json(json_)
-
-        json_ = json.dumps(
-            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
+        if isinstance(data, str):
+            data = get_json(data)
+
+        data = json.dumps(
+            data,
+            ensure_ascii=False,
+            indent=indent,
+            skipkeys=True,
+            sort_keys=sort_keys,
+            default=str,
         )

     except Exception as e:
-        log.error(e)
-        json_ = pformat(json_)
+        data = pformat(data)

-    return json_
+    return data


 def get_json_value(json_object, key):
@@ -2552,3 +2541,19 @@ def ensure_float(n):
     if not n:
         return 0.0
     return float(n)
+
+
+def ensure_int64(n):
+    """
+    >>> ensure_int64(None)
+    0
+    >>> ensure_float(False)
+    0
+    >>> ensure_float(12)
+    12
+    >>> ensure_float("72")
+    72
+    """
+    if not n:
+        return bson.int64.Int64(0)
+    return bson.int64.Int64(n)
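
Two of the helpers touched above in one short sketch: get_domain now relies on urllib.parse.urlparse instead of the deprecated splittype/splithost pair, and ensure_int64 wraps values in bson's Int64 so they are stored as NumberLong in MongoDB:

    import bson
    from urllib.parse import urlparse

    print(urlparse("https://www.example.com:8080/list?page=1").netloc)  # www.example.com:8080
    print(type(bson.int64.Int64(72)))                                   # <class 'bson.int64.Int64'>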

+ 167 - 72
FworkSpider/feapder/utils/webdriver.py

@@ -1,20 +1,23 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2021/3/18 4:59 下午
+Created on 2023-03-01
 ---------
-@summary:
+@summary: 远程selenium服务
 ---------
-@author: Boris
-@email: boris_liu@foxmail.com
+@author: dzr
+@email: dongzhaorui@topnet.net.cn
 """

+import os
 import queue
 import threading
-import os
+
 from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
+from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
 from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver

+from feapder.setting import WEBDRIVER
 from feapder.utils.log import log
 from feapder.utils.tools import Singleton

@@ -22,9 +25,8 @@ DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit


 class WebDriver(RemoteWebDriver):
-    '''浏览器采集 - selenium'''
+    """浏览器采集 - selenium"""
     CHROME = "CHROME"
-    PHANTOMJS = "PHANTOMJS"
     FIREFOX = "FIREFOX"

     def __init__(
@@ -32,25 +34,33 @@ class WebDriver(RemoteWebDriver):
         load_images=True,
         load_images=True,
         user_agent=None,
         user_agent=None,
         proxy=None,
         proxy=None,
-        headless=False,
-        driver_type=CHROME,
-        timeout=16,
+        driver_type=FIREFOX,
+        timeout=10,
         window_size=(1024, 800),
         window_size=(1024, 800),
-        executable_path=None,
+        server_addr=None,
         custom_argument=None,
         custom_argument=None,
+        version=None,
+        usages_local_driver=True,
+        headless=False,
+        executable_path=None,
+        service_log_path=None,
         **kwargs
         **kwargs
     ):
     ):
         """
         """
-        webdirver 封装,支持chrome、phantomjs 和 firefox
+        webdirver 封装,支持 chrome 和 firefox
         Args:
         Args:
             load_images: 是否加载图片
             load_images: 是否加载图片
             user_agent: 字符串 或 无参函数,返回值为user_agent
             user_agent: 字符串 或 无参函数,返回值为user_agent
             proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
             proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
             headless: 是否启用无头模式
             headless: 是否启用无头模式
-            driver_type: CHROME 或 PHANTOMJS,FIREFOX
+            driver_type: CHROME 或 FIREFOX...
             timeout: 请求超时时间
             timeout: 请求超时时间
             window_size: # 窗口大小
             window_size: # 窗口大小
             executable_path: 浏览器路径,默认为默认路径
             executable_path: 浏览器路径,默认为默认路径
+            server_addr: 远程服务地址
+            usages_local_driver: 使用本地驱动
+            service_log_path: selenium service 日志路径
+            version: 浏览器版本
             **kwargs:
             **kwargs:
         """
         """
         self._load_images = load_images
         self._load_images = load_images
@@ -59,18 +69,16 @@ class WebDriver(RemoteWebDriver):
         self._headless = headless
         self._headless = headless
         self._timeout = timeout
         self._timeout = timeout
         self._window_size = window_size
         self._window_size = window_size
-        self._executable_path = executable_path
+        self._server_addr = server_addr or WEBDRIVER["server_addr"]
         self._custom_argument = custom_argument
         self._custom_argument = custom_argument
-
-        self.proxies = {}
-        self.user_agent = None
+        self._version = version or WEBDRIVER["version"]
+        self._executable_path = executable_path
+        self._usages_local_driver = usages_local_driver
+        self._service_log_path = service_log_path
 
 
         if driver_type == WebDriver.CHROME:
         if driver_type == WebDriver.CHROME:
             self.driver = self.chrome_driver()
             self.driver = self.chrome_driver()
 
 
-        elif driver_type == WebDriver.PHANTOMJS:
-            self.driver = self.phantomjs_driver()
-
         elif driver_type == WebDriver.FIREFOX:
         elif driver_type == WebDriver.FIREFOX:
             self.driver = self.firefox_driver()
             self.driver = self.firefox_driver()
 
 
@@ -93,30 +101,30 @@ class WebDriver(RemoteWebDriver):
         if exc_val:
         if exc_val:
             log.error(exc_val)
             log.error(exc_val)
 
 
-        self.quit()
-        return True
+        self.get_driver().quit()
+        return False
 
 
     def get_driver(self):
     def get_driver(self):
         return self.driver
         return self.driver
 
 
-    def firefox_driver(self):
+    def local_firefox_driver(self):
         firefox_profile = webdriver.FirefoxProfile()
         firefox_profile = webdriver.FirefoxProfile()
         firefox_options = webdriver.FirefoxOptions()
         firefox_options = webdriver.FirefoxOptions()
         firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
         firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
-        firefox_profile.set_preference("dom.webdriver.enabled",False)
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
         if self._proxy:
         if self._proxy:
             proxy = self._proxy() if callable(self._proxy) else self._proxy
             proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://","")
+            proxy = proxy.replace("socks5://", "")
             # 使用socks5 代理
             # 使用socks5 代理
             firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
             firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
             firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
             firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
             firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
             firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
-            # firefox_capabilities["marionette"] = True  # http代理的使用
 
 
         if self._user_agent:
         if self._user_agent:
             firefox_profile.set_preference(
             firefox_profile.set_preference(
                 "general.useragent.override",
                 "general.useragent.override",
-                self._user_agent() if callable(self._user_agent) else self._user_agent,
+                self._user_agent() if callable(
+                    self._user_agent) else self._user_agent,
             )
             )
 
 
         if not self._load_images:
         if not self._load_images:
@@ -137,12 +145,14 @@ class WebDriver(RemoteWebDriver):
                 options=firefox_options,
                 options=firefox_options,
                 firefox_profile=firefox_profile,
                 firefox_profile=firefox_profile,
                 executable_path=self._executable_path,
                 executable_path=self._executable_path,
+                service_log_path=self._service_log_path
             )
             )
         else:
         else:
             driver = webdriver.Firefox(
             driver = webdriver.Firefox(
                 capabilities=firefox_capabilities,
                 capabilities=firefox_capabilities,
                 options=firefox_options,
                 options=firefox_options,
                 firefox_profile=firefox_profile,
                 firefox_profile=firefox_profile,
+                service_log_path=self._service_log_path
             )
             )
 
 
         if self._window_size:
         if self._window_size:
@@ -150,20 +160,73 @@ class WebDriver(RemoteWebDriver):
 
 
         return driver
         return driver
 
 
-    def chrome_driver(self):
+    def remote_firefox_driver(self):
+        firefox_capabilities = {
+            "browserName": "firefox",
+            "platform": "ANY",
+            "version": self._version,
+            "javascriptEnabled": True,
+            "marionette": False,
+        }
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_options.add_argument("--disable-gpu")
+        firefox_options.set_preference("dom.webdriver.enabled", False)
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # 使用socks5 代理
+            ip, port = proxy.split(":")
+            firefox_options.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
+            firefox_options.set_preference('network.proxy.socks', ip)
+            firefox_options.set_preference('network.proxy.socks_port', int(port))
+            # firefox_capabilities["marionette"] = True  # http代理的使用
+
+        if self._user_agent:
+            firefox_options.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_options.set_preference("permissions.default.image", 2)
+
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
+        browser = webdriver.Remote(
+            command_executor=executor,
+            desired_capabilities=firefox_capabilities,
+            options=firefox_options
+        )
+
+        if self._window_size:
+            browser.set_window_size(*self._window_size)
+
+        return browser
+
+    def firefox_driver(self):
+        if self._usages_local_driver:
+            return self.local_firefox_driver()
+        return self.remote_firefox_driver()
+
+    def remote_chrome_driver(self):
+        chrome_capabilities = {
+            "browserName": "chrome",
+            "platform": "ANY",
+            "version": self._version,
+            "javascriptEnabled": True,
+        }
         chrome_options = webdriver.ChromeOptions()
         chrome_options = webdriver.ChromeOptions()
+
         # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
         # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
         chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         chrome_options.add_experimental_option("useAutomationExtension", False)
         chrome_options.add_experimental_option("useAutomationExtension", False)
         # docker 里运行需要
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-gpu")
 
 
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
         if self._user_agent:
         if self._user_agent:
             chrome_options.add_argument(
             chrome_options.add_argument(
                 "user-agent={}".format(
                 "user-agent={}".format(
@@ -172,15 +235,19 @@ class WebDriver(RemoteWebDriver):
                     else self._user_agent
                     else self._user_agent
                 )
                 )
             )
             )
+        # 不支持socks5协议
+        # if self._proxy:
+        #     chrome_options.add_argument(
+        #         "--proxy-server={}".format(
+        #             self._proxy() if callable(self._proxy) else self._proxy
+        #         )
+        #     )
+
         if not self._load_images:
         if not self._load_images:
             chrome_options.add_experimental_option(
             chrome_options.add_experimental_option(
                 "prefs", {"profile.managed_default_content_settings.images": 2}
                 "prefs", {"profile.managed_default_content_settings.images": 2}
             )
             )
 
 
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
         if self._window_size:
         if self._window_size:
             chrome_options.add_argument(
             chrome_options.add_argument(
                 "--window-size={},{}".format(self._window_size[0], self._window_size[1])
                 "--window-size={},{}".format(self._window_size[0], self._window_size[1])
@@ -191,68 +258,95 @@ class WebDriver(RemoteWebDriver):
             for arg in self._custom_argument:
             for arg in self._custom_argument:
                 chrome_options.add_argument(arg)
                 chrome_options.add_argument(arg)
 
 
-        if self._executable_path:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options, executable_path=self._executable_path
-            )
-        else:
-            driver = webdriver.Chrome(chrome_options=chrome_options)
+        browser = webdriver.Remote(
+            command_executor=ChromeRemoteConnection(
+                remote_server_addr=self._server_addr,
+                keep_alive=True),
+            desired_capabilities=chrome_capabilities,
+            options=chrome_options
+        )
 
 
         # 隐藏浏览器特征
         # 隐藏浏览器特征
         with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
         with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
             js = f.read()
             js = f.read()
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
-
-        return driver
-
-    def phantomjs_driver(self):
-        import warnings
+            params = {
+                'cmd': 'Page.addScriptToEvaluateOnNewDocument',
+                'params': {'source': js}
+            }
+            res = browser.execute("executeCdpCommand", params)['value']
 
 
-        warnings.filterwarnings("ignore")
+        return browser
 
 
-        service_args = []
-        dcap = DesiredCapabilities.PHANTOMJS
+    def local_chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        # docker 里运行需要
+        chrome_options.add_argument("--no-sandbox")
 
 
         if self._proxy:
         if self._proxy:
-            service_args.append(
-                "--proxy=%s" % self._proxy() if callable(self._proxy) else self._proxy
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
             )
             )
         if self._user_agent:
         if self._user_agent:
-            dcap["phantomjs.page.settings.userAgent"] = (
-                self._user_agent() if callable(self._user_agent) else self._user_agent
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
             )
             )
         if not self._load_images:
         if not self._load_images:
-            service_args.append("--load-images=no")
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
 
 
         # 添加自定义的配置参数
         # 添加自定义的配置参数
         if self._custom_argument:
         if self._custom_argument:
             for arg in self._custom_argument:
             for arg in self._custom_argument:
-                service_args.append(arg)
+                chrome_options.add_argument(arg)
 
 
         if self._executable_path:
         if self._executable_path:
-            driver = webdriver.PhantomJS(
-                service_args=service_args,
-                desired_capabilities=dcap,
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
                 executable_path=self._executable_path,
                 executable_path=self._executable_path,
+                service_log_path=self._service_log_path
             )
             )
         else:
         else:
-            driver = webdriver.PhantomJS(
-                service_args=service_args, desired_capabilities=dcap
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                service_log_path=self._service_log_path
             )
             )
 
 
-        if self._window_size:
-            driver.set_window_size(self._window_size[0], self._window_size[1])
-
-        del warnings
+        # 隐藏浏览器特征
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
 
 
         return driver
         return driver
 
 
+    def chrome_driver(self):
+        if self._usages_local_driver:
+            return self.local_chrome_driver()
+        return self.remote_chrome_driver()
+
     @property
     @property
     def cookies(self):
     def cookies(self):
         cookies_json = {}
         cookies_json = {}
         for cookie in self.driver.get_cookies():
         for cookie in self.driver.get_cookies():
             cookies_json[cookie["name"]] = cookie["value"]
             cookies_json[cookie["name"]] = cookie["value"]
-
         return cookies_json
         return cookies_json
 
 
     @cookies.setter
     @cookies.setter
@@ -274,8 +368,9 @@ class WebDriver(RemoteWebDriver):
         else:
         else:
             raise AttributeError
             raise AttributeError
 
 
-    def __del__(self):
-        self.quit()
+    # def __del__(self):
+    #     if self.driver:
+    #         self.driver.quit()
 
 
 
 
 @Singleton
 @Singleton
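
A hedged usage sketch for the rewritten wrapper above: with usages_local_driver=False it goes through webdriver.Remote against server_addr (falling back to setting.WEBDRIVER), otherwise it starts a local browser. Attribute access appears to be proxied to the wrapped selenium driver, so page_source below is an assumption based on that; the URL is a placeholder.

# Sketch only: using feapder.utils.webdriver.WebDriver as defined in this diff.
from feapder.utils.webdriver import WebDriver

with WebDriver(driver_type=WebDriver.FIREFOX,   # or WebDriver.CHROME
               usages_local_driver=False,       # False -> remote grid at WEBDRIVER["server_addr"]
               load_images=False) as browser:
    browser.get("https://example.com")          # placeholder page
    html = browser.page_source                  # assumed to be proxied to the underlying driver
    cookies = browser.cookies                   # dict built from driver.get_cookies()
# __exit__ calls get_driver().quit(), so the session is closed automatically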

+ 71 - 53
FworkSpider/items/__init__.py

@@ -1,27 +1,37 @@
-from feapder import Item
-from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
-import time
+import feapder.utils.tools as tools
 from feapder.utils.log import log
-global xxc
-xxc = 0
-class DataNjpcItem(Item):
+from items.base_item import SwordFishProjectItem
+from untils.check_data import CheckData
+from untils.tools import int2long, substitute, text_search
+
+
+class DataNjpcItem(SwordFishProjectItem):
+    """拟建数据"""
     def __init__(self):
     def __init__(self):
-        # 一类字段
-        self.href = ""  # 非竞品快照页地址
-        self.projectname = ""  # 项目名称
-        self.publishtime = ""  # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.contentlhtml = ""  # 快照页源码
+        super(DataNjpcItem, self).__init__()
+
+        self.table_name = "data_bak"  # 拟建数据存储表名
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
+        # 一类字段
+        self.href = ""  # 非竞品详情页地址
+        self.title = ""  # 标题
+        self.projectname = ""  # 项目名称
+        self.publishtime = ""  # 文章发布时间(时间戳),单位:秒
         self.area = "全国"  # 省
         self.area = "全国"  # 省
         self.city = ""  # 市
         self.city = ""  # 市
-        self.district = ""  # 区县
+        self.district = ""  # 区/县
+        self.contenthtml = ""  # 详情页源码
+        self.detail = ""  # 详情页源码清洗之后的文本
+        self.projectinfo = None  # 附件信息,详见剑鱼拟建规范
 
 
-        # 辅助字段 存储时的辅助字段
-        self.save = True  # 区县
-        self.sendflag = False
+        # 默认设置
+        self.sendflag = "false"
+        self.T = "bidding"
+        self.infoformat = 2
 
 
         # 以下字段为 二类字段,没有则不做存储,不在存储结构中
         # 以下字段为 二类字段,没有则不做存储,不在存储结构中
         # 附件,默认为Null 正确的格式为 projectinfo.attachments = [{
         # 附件,默认为Null 正确的格式为 projectinfo.attachments = [{
@@ -79,60 +89,68 @@ class DataNjpcItem(Item):
         # 施工单位联系人	constructionunitperson
         # 施工单位联系人	constructionunitperson
         # 施工单位联系方式	constructionunittel
         # 施工单位联系方式	constructionunittel
         # 施工单位地址	constructionunitaddr
         # 施工单位地址	constructionunitaddr
+
     def pre_to_db(self):
     def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.comeintime = int2long(time.time())
-        # 根据文章发布时间 生成发布时间的时间戳(秒级), 定义为long型
-        '''
-        如果无法解析到发布时间、可以考虑补一个发布时间
-        '''
-        # if "-" in self.publishtime:
-        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
-        # else:
-        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
+        if not self.title:
+            self.title = self.projectname
+            log.debug("请检测 < title > 是否正确!")
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 生成入库时间戳(秒级)
 
 
         if "-" in str(self.publishtime) and ":" in str(self.publishtime):
         if "-" in str(self.publishtime) and ":" in str(self.publishtime):
-            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
+            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
         elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
-            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
-        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13:
+            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
+        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13: # 或许是时间戳
             self.publishtime = int2long(int(str(self.publishtime)[:10]))
             self.publishtime = int2long(int(str(self.publishtime)[:10]))
         else:
         else:
-            raise ValueError("The publication time format is incorrect -> %r " %(self.publishtime))
+            raise ValueError("发布时间格式不正确 -> %r " %(self.publishtime))
 
 
-        # 数据获取失败处理:输出错误日志
         if not self.projectname or not self.publishtime or not self.href:
         if not self.projectname or not self.publishtime or not self.href:
-            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.projectname}")
-            self.save=False
-        if self.contentlhtml is not None and self.detail =='':
-            self.detail = substitute(self.contentlhtml)
-            '''
-            detail:去头、去尾
-            '''
+            self.save = False
+            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+
+        if not self.contenthtml:
+            self.save = False
+            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+        else:
+            if not self.detail:
+                self.detail = substitute(self.contenthtml)
+
             if text_search(self.detail).total == 0:
             if text_search(self.detail).total == 0:
-                # 无正文内容时,该内容直接标记true, 不在被统计、不入生产库
                 self.sendflag = "true"
                 self.sendflag = "true"
 
 
-class NjpcListItem(Item):
+        if not self.projectinfo:
+            del self.projectinfo
+
+
+class NjpcListItem(SwordFishProjectItem):
+
     def __init__(self):
     def __init__(self):
-        # 一类字段
-        self.href = ""  # 非竞品快照页地址
-        self.projectname = ""  # 项目名称
-        self.publishtime = ""  # 文章发布时间(日期格式 xxxx-xx-xx)
+        super(NjpcListItem, self).__init__()
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.site = ""  # 采集的站点(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
         self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
+        self.parser_name = ""  # 详情爬虫从MongoDB拉取任务的唯一标识,建议使用 spidercode 命名
+        self.parser_url = ""  # 详情页数据地址
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 入库时间
+
+        # 一类字段
+        self.href = ""  # 非竞品详情页地址
+        self.projectname = ""  # 项目名称
+        self.publishtime = ""  # 文章发布时间
         self.area = "全国"  # 省
         self.area = "全国"  # 省
         self.city = ""  # 市
         self.city = ""  # 市
-        self.district = ""  # 区县
-
-        # 辅助字段 存储时的辅助字段
-        self.save = True  # 区县
-        self.parser_name = ""  # 处理详情页爬虫的名称
-        self.parser_url = ""  # 处理详情页的url
-        self.failed = 0 #失败请求的计数
+        self.district = ""  # 区/县
 
 
+        self.request_params = {}  # 定义callback所需的参数,诸如render,headers,method,data,params等等,必须与requests请求的参数名称对应,否则无法识别
 
 
     def pre_to_db(self):
     def pre_to_db(self):
-        pass
-
+        if CheckData.channel(self.channel, group="njpc"):
+            code, reason = CheckData.title(self.projectname, group="njpc")
+            if code == 10106:
+                log.warning(f"{self.projectname}--不可入库,原因:{reason}")
+                self.save = False

+ 115 - 111
FworkSpider/items/spider_item.py

@@ -1,140 +1,144 @@
-from feapder import Item
-from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
-import time
+import feapder.utils.tools as tools
 from feapder.utils.log import log
-from feapder.utils.tools import get_current_date
-from datetime import datetime
-import os
-from feapder import setting
-global xxc
-xxc = 0
+from items.base_item import SwordFishProjectItem
+from untils.check_data import CheckData
+from untils.tools import (
+    int2long,
+    substitute,
+    text_search,
+)
 
 
-class DataBakItem(Item):
 
 
+class DataBakItem(SwordFishProjectItem):
+    """标讯数据"""
     def __init__(self):
     def __init__(self):
+        super(DataBakItem, self).__init__()
+
+        self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
+        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
+        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+
         self.title = ""  # 文章标题
         self.title = ""  # 文章标题
-        self.publishtime = ""   # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.spidercode = ""   # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""   # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""   # 采集的版块(编辑器爬虫平台定义)
-        self.area = "全国"   # 省
-        self.city = ""   # 市
-        self.competehref = None   # 竞品快照页地址
-        self.href = ""   # 非竞品快照页地址
-        self.publishdept = ""
-        self.iscompete=True
-        self.type = ""
-        self.T = "bidding"
+        self.s_title = ""  # 详情页标题(有必填),默认提供列表页标题
+        self.area = "全国"  # 省
+        self.city = ""  # 市
+        self.district = ""  # 区/县
+        self.publishtime = ""  # 文章发布时间(列表页或者详情页发布时间)
         self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
         self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
         self.comeintime = ""  # 入库时间戳(秒级), 需定义为long型
         self.comeintime = ""  # 入库时间戳(秒级), 需定义为long型
+        self.contenthtml = ""  # 详情页源码
+        self.detail = ""  # 详情页源码清洗之后的文本
+
+        self.href = ""  # 非竞品详情页地址
+        self.competehref = None  # 竞品详情页地址
+        self.projectinfo = None  # 附件信息,详见剑鱼招投标规范
+
+        self.iscompete = True  # 新爬虫
+
         self.sendflag = "false"
         self.sendflag = "false"
+        self.T = "bidding"
+        self.infoformat = 1
+
+        # 默认设置
+        self.type = ""
+        self.publishdept = ""
         self._d = "comeintime"
         self._d = "comeintime"
-        self.contenthtml = ""  # 快照页源码
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.projectinfo = None  # 快照页源码清洗之后招投标文本
-        self.save = True
-    def stop(self):
-        self.save=False
-        raise HtmlEmptyError
 
 
     def pre_to_db(self):
     def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.comeintime = int2long(time.time())
-        # 根据文章发布时间 生成发布时间的时间戳(秒级), 定义为long型
-        '''如果无法解析到发布时间、可以考虑补一个发布时间
-        '''
+        if not self.s_title:
+            self.s_title = self.title
+            log.debug("请检测 < s_title > 是否正确!")
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 生成入库时间戳(秒级), 定义为long型
+
         if ":" in self.publishtime:
         if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
+            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         else:
         else:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
+            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
 
 
-        # 数据获取失败处理:输出错误日志
-        if self.contenthtml is None and self.projectinfo is None:
-            log.error(f"{self.href},此链接数据正文抓取失败")
-            # self.sendflag = "true"
-            self.stop()
-        if not self.title or not self.publishtime or not self.href:
-            # self.sendflag = "true"
-            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.title}")
-            self.stop()
         # html处理正文
         # html处理正文
-        if self.contenthtml is not None and self.detail =='':
-            self.detail = substitute(self.contenthtml)
-            '''
-            detail:去头、去尾
-            '''
+        if not self.contenthtml:
+            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+            self.save = False
+        else:
+            if not self.detail:
+                self.detail = substitute(self.contenthtml)
+
             if text_search(self.detail).total == 0:
             if text_search(self.detail).total == 0:
-                # 无正文内容时,该内容直接标记true, 不在被统计
-                self.sendflag = "true"
+                self.sendflag = "true"   # 无内容数据,数据不入保存服务
+
+        if not self.title or not self.publishtime or not self.href:
+            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+            self.save = False
+
+        # 竞品网站-详情页地址标识字段
+        if not self.competehref:
+            del self.competehref
 
 
+        # 详情无附件,不需要 projectinfo 字段
+        if not self.projectinfo:
+            del self.projectinfo
 
 
-class MgpListItem(Item):
+
+class ExamineAndApproveItem(DataBakItem):
+    """审批数据"""
     def __init__(self):
     def __init__(self):
-        # self.__table_name__='ggg_list'
-
-        self.parse = "" # 需要调用的方法名称
-        self.item = "" # 传过来的参数
-        self.parser_name = "" # 处理详情页的爬虫名
-        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 当前日期时间
-        self.comeintime = int2long(int(time.time())) # 当前日期时间戳
-        self.deal_detail = [] # 定义解析详情页主页内容的解析,detail_get是一个xpath列表,detail_post 则是一段处理代码
-        self.create_time = None # 定义解析详情页发布时间的xpath,列表页无发布时间时应用
-        self.parse_url = "" # 定义解析详情页主页内容的xpath
-        self.request_params = {} # 定义callback所需的参数,诸如render,headers,method,data,params等等,
-                                # 必须与requests请求的参数名称对应,否则无法识别
-        self.failed = 0 #失败请求的计数
-        self.author = "开发及维护人员" # 开发及维护人员
-        self.ex_js = ''  # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
-        self.ex_python = None # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
-        self.pri = 1 # 爬虫报警级 可分9级
-        self.proxies = True # 爬虫报警级 可分9级
-        self.files = False # 附件采集配置
-        self.error = None
-        self.spidercode = ""
-        self.save=True
-
-        # self.error_info =
-    def pre_to_db(self):
-        # 生成入库时间戳(秒级), 定义为long型
-        self.author = os.path.basename(os.getcwd())
-        self.spidercode = self.item.get("spidercode")
+        super(ExamineAndApproveItem, self).__init__()
 
 
-        if "通知公告" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-        elif "公告公示" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
+        self.table_name = "data_bak"
+
+        self.T = "bidding"
+        self.infoformat = 2
+
+
+class PropertyRightItem(DataBakItem):
+    """产权数据"""
+    def __init__(self):
+        super(PropertyRightItem, self).__init__()
+
+        self.table_name = "data_bak"
+
+        self.T = "bidding_other"
+        self.infoformat = 3
 
 
-        global xxc
-        xxc += 1
 
 
-    def open_spider(self):
-        pass
+class MgpListItem(SwordFishProjectItem):
 
 
-class ListItem(Item):
     def __init__(self):
     def __init__(self):
+        super(MgpListItem, self).__init__()
+
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
         self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
-        self.url = ''
-        self.count=0
-        self.code=-1
-        self.rel_count = 0
-        self.save=True
 
 
-    def pre_to_db(self):
-        time.sleep(0.1)
-        self.author = setting.author.get(os.path.basename(os.getcwd()))
-        if self.author is None:
-            self.author = os.path.basename(os.getcwd())
-        self.runtime = get_current_date(date_format="%Y-%m-%d")
-        global xxc
-        print("xxc___________________",xxc)
-        self.rel_count = xxc
-        xxc = 0
+        self.parse_url = ""  # 详情爬虫访问地址
+        self.parser_name = ""  # 详情爬虫从MongoDB拉取任务的唯一标识,建议使用 spidercode 命名
+        self.parse = ""  # 详情爬虫解析回调方法名
+
+        self.request_params = {}  # 定义callback所需的参数,诸如render,headers,method,data,params等等,必须与requests请求的参数名称对应,否则无法识别
+        self.proxies = True  # 代理
+
+        self.comeintime = int2long(tools.get_current_timestamp())  # 入库时间
 
 
+        self.deal_detail = []  # 定义解析详情页主页内容的xpath列表
+        self.ex_js = ""  # 定义需要执行的js代码,包括但不限于script、文件路径等
+        self.ex_python = None  # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
 
 
+        self.files = False  # 采集附件配置
 
 
+    @property
+    def item(self) -> dict:
+        return self.__dict__["item"]
+
+    @item.setter
+    def item(self, data_item: DataBakItem):
+        self.__dict__["item"] = data_item.to_dict
+
+    def pre_to_db(self):
+        self.spidercode = self.item["spidercode"]
+
+        title = self.item.get("title")
+        channel = self.item["channel"]
+        if CheckData.channel(channel):
+            code, reason = CheckData.title(title)
+            if code == 10106:
+                log.warning(f"{title}--不可入库,原因:{reason}")
+                self.save = False

+ 0 - 0
FworkSpider/login_pool/__init__.py


+ 0 - 95
FworkSpider/login_pool/zglbw.py

@@ -1,95 +0,0 @@
-
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-from untils.cookie_pool import LoginCookiePool
-import requests
-class ZglbwPool(LoginCookiePool):
-
-    def create_cookie(self, username, password):
-        print(username,password)
-        '''
-        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-        
-        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-        '''
-        session = requests.Session()
-        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"}
-        url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=9d424669-5af6-4b3d-bed5-56cc06bd5ca6'
-        data = {
-            "clear": "",
-            "BackURL": "null",
-            "username": username,
-            "password": password,
-            "jcaptchaCode": "shmt"
-        }
-        session.get(url,headers=headers)
-        session.post(url, data=data)
-        # print(res.headers)
-        ss = session.get(url='https://eproport.crecgec.com/getAuthentication')
-        print(ss.text)
-        cookies = requests.utils.dict_from_cookiejar(session.cookies)
-        print(cookies)
-        return cookies
-
-
-
-
-# cookie_pool = ZglbwPool(username_key='username', password_key="password", table_userbase='zglbw',
-#                               redis_key='zglbw')
-# # cookie_pool.create_cookie('zuoshang123',"123qwe!A")
-# # # res = requests.get('https://eproport.crecgec.com/getAuthentication',cookies=cookie)
-# # # print(res.text)
-# cookie_pool.del_cookie(cookie_pool.get_cookie())
-
-
-# def create_cookie():
-#     '''
-#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-#
-#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
-#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
-#     '''
-#     session = requests.Session()
-#     url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&response_type=code'
-#     data = {
-#         "clear": "",
-#         "BackURL": "null",
-#         "username": "zuoshang123",
-#         "password": "123qwe!A",
-#         "jcaptchaCode": "shmt"
-#     }
-#     session.get(url)
-#     res = session.post(url, data=data)
-#
-# create_cookie()
-# # import requests
-#
-#
-#
-# # cookies = {
-# #     "srv_id": "53069e9fd596ee2f1c7cf21d24bd170e",
-# #     "uid": "e423da7f-1d30-4571-a011-429326f1cfd1",
-# #     "Hm_lvt_89c053c39b2269b8a37c5881ca224223": "1642647201",
-# #     "JSESSIONID": "752173C3FF0C519DB45BBF781CEC76CB",
-# #     "Hm_lpvt_89c053c39b2269b8a37c5881ca224223": "1642661696"
-# # }
-# # url = "https://passport.crecgec.com/authorize"
-# # params = {
-# #     "type": "cas",
-# #     "client_id": "10000000`53",
-# #     "response_type": "code"
-# # }
-# # data = {
-# #     "clear": "",
-# #     "BackURL": "null",
-# #     "username": "zuoshang123",
-# #     "password": "123qwe!A",
-# #     "jcaptchaCode": "shmt"
-# # }
-# # response = requests.post(url, headers=headers, cookies=cookies, params=params, data=data)
-# #
-# # print(response.text)
-# # print(response)

+ 0 - 56
FworkSpider/mongo_pipeline.py

@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 导出数据
----------
-@author: 马国鹏
-@email:  305021384@qq.com
-"""
-from typing import Dict, List, Tuple
-import time
-from feapder.db.redisdb import RedisDB
-from feapder.dedup import Dedup
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-from untils.tools import *
-
-
-
-class RedisPipeline(BasePipeline):
-    '''数据存储管道-redis版'''
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = RedisDB()
-            print("创建新连接?")
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-        """
-        try:
-            add_count = self.to_db.lpush(table="savemongo:"+table, values=items)
-            print(add_count)
-            datas_size = len(items)
-            log.info(
-                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
-                % (datas_size, table, len(items), datas_size - len(items))
-            )
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-

+ 0 - 98
FworkSpider/mongo_pipeline_old.py

@@ -1,98 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 导出数据
----------
-@author: 马国鹏
-@email:  305021384@qq.com
-"""
-from typing import Dict, List, Tuple
-import time
-from feapder.db.mongodb import MongoDB
-from feapder.dedup import Dedup
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-from untils.tools import *
-# from crawlab import save_item
-
-
-
-class MongoPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-            print("创建新连接?")
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-        """
-        try:
-            add_count = self.to_db.add_batch(coll_name=table, datas=items)
-            for item in items:
-                dedup = Dedup(Dedup.BloomFilter)
-                dedup.add([item.get("href")])
-                # save_item({'count':item.get("href")})
-            datas_size = len(items)
-            log.info(
-                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
-                % (datas_size, table, add_count, datas_size - add_count)
-            )
-            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
-            # for i in range(add_count):
-            # if table == "mgp_list":
-            #     save_item({"site": "失败回填", "title": add_count})
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-
-    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
-        """
-        更新数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-            update_keys: 更新的字段, 如 ("title", "publish_time")
-
-        Returns: 是否更新成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-        try:
-            # self.to_db.find()
-            add_count = self.to_db.add_batch(
-                coll_name=table,
-                datas=items,
-                update_columns=update_keys or list(items[0].keys()),
-            )
-            datas_size = len(items)
-            update_count = datas_size - add_count
-            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
-                datas_size,
-                table,
-                add_count,
-                update_count,
-            )
-            if update_keys:
-                msg += " 更新字段为 {}".format(update_keys)
-            log.info(msg)
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
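
Both deleted pipelines implemented the same two BasePipeline hooks; for reference, a minimal skeleton of that contract (method names and signatures as in the removed files, bodies are stubs only).

# Skeleton of the pipeline interface the removed files implemented (stub only).
from typing import Dict, List, Tuple

from feapder.pipelines import BasePipeline


class NoopPipeline(BasePipeline):
    def save_items(self, table, items: List[Dict]) -> bool:
        # Return True when the batch was persisted; False keeps it out of the dedup
        # store so it can be exported again later.
        return True

    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
        # Same contract for updates; update_keys lists the columns to overwrite.
        return True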

+ 89 - 151
FworkSpider/setting.py

@@ -1,181 +1,119 @@
 # -*- coding: utf-8 -*-
 """爬虫配置文件"""
+import datetime
 import os
 import os
-import time
 import sys
 import sys
-# from scoket_proxy import Socks5Proxy
-#
-# # MYSQL
-# MYSQL_IP = "localhost"
-# MYSQL_PORT = 3306
-# MYSQL_DB = ""
-# MYSQL_USER_NAME = ""
-# MYSQL_USER_PASS = ""
-#
+
 # MONGODB
 # MONGODB
-# MONGO_IP = "192.168.20.51"  # 本地 docker 环境
-MONGO_IP = "172.17.4.87"  # 线上环境
+MONGO_IP = "172.17.4.87"
 MONGO_PORT = 27080
 MONGO_PORT = 27080
-# MONGO_PORT = 27001
 MONGO_DB = "py_spider"
 MONGO_DB = "py_spider"
-# MONGO_USER_NAME = ""
-# MONGO_USER_PASS = ""
-#
-# # REDIS
-# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-# REDISDB_IP_PORTS = "192.168.20.51:6379"  # 本地 docker 环境
-REDISDB_IP_PORTS = "172.19.0.1:6379"  # 环境
-# REDISDB_USER_PASS = ""
+
+# REDIS
+# ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+REDISDB_IP_PORTS = "172.17.4.232:7361"
+REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
 REDISDB_DB = 10
 REDISDB_DB = 10
-# # 适用于redis哨兵模式
-REDISDB_SERVICE_NAME = "quchoong"  # 没用到
-#
-# # 数据入库的pipeline,可自定义,默认MysqlPipeline
+
+# 数据入库的pipeline,可自定义,默认RedisPipeline
 ITEM_PIPELINES = [
 ITEM_PIPELINES = [
-    # "feapder.pipelines.mysql_pipeline.MysqlPipeline",
     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
-    "mongo_pipeline.MongoPipeline"
+    "feapder.pipelines.swordfish.redis_pipeline.RedisPipeline"
 ]
 ]
-EXPORT_DATA_MAX_FAILED_TIMES = 5 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
-EXPORT_DATA_MAX_RETRY_TIMES = 5 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
-#
-# # 爬虫相关
-# # COLLECTOR
-# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
-# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
-#
-REDIS_KEY = "fwork" # 没用到
-# # SPIDER
-SPIDER_THREAD_COUNT = 1  # 爬虫并发数
-# SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
-# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
-SPIDER_MAX_RETRY_TIMES = 5  # 每个请求最大重试次数
-# KEEP_ALIVE = False  # 爬虫是否常驻
-#
-# # 浏览器渲染
-WEBDRIVER  = dict(
+# 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
+EXPORT_DATA_MAX_FAILED_TIMES = 5
+# 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
+EXPORT_DATA_MAX_RETRY_TIMES = 5
+
+COLLECTOR_TASK_COUNT = 100  # 每次获取任务数量
+
+# 爬虫
+SPIDER_THREAD_COUNT = 1  # 爬虫并发数,追求速度推荐32
+SPIDER_MAX_RETRY_TIMES = 3  # 每个请求最大重试次数
+
+# 浏览器渲染
+WEBDRIVER = dict(
+    server_addr="http://172.17.4.232:6666/wd/hub",  # selenium 远程服务地址
+    version="",  # 浏览器版本。不指定版本时,随机分发,版本详见群公告
     pool_size=1,  # 浏览器的数量
     pool_size=1,  # 浏览器的数量
     load_images=False,  # 是否加载图片
     load_images=False,  # 是否加载图片
-    # user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
-    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
     headless=True,  # 是否为无头浏览器
     headless=True,  # 是否为无头浏览器
-    driver_type="FIREFOX",  # CHROME、PHANTOMJS、FIREFOX
+    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    driver_type="FIREFOX",  # CHROME、FIREFOX、EDGE
     timeout=30,  # 请求超时时间
     timeout=30,  # 请求超时时间
+    executable_path=None,  # 浏览器路径,默认为默认路径
+    usages_local_driver=True,  # 是否使用本地驱动,默认启动本地驱动
     window_size=(1280, 800),  # 窗口大小
     window_size=(1280, 800),  # 窗口大小
-    # executable_path="D:\\geckodriver.exe",  # 浏览器路径,默认为默认路径
     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+    service_log_path=os.devnull  # 日志路径,默认置空
 )
 )
-#wget https://github.com/mozilla/geckodriver/releases/download/v0.25.0/geckodriver-v0.25.0-linux64.tar.gz
-# # 爬虫启动时,重新抓取失败的requests
-# RETRY_FAILED_REQUESTS = False
-# # 保存失败的request
-# SAVE_FAILED_REQUEST = True
-# # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
-# REQUEST_LOST_TIMEOUT = 600  # 10分钟
-# # request网络请求超时时间
-# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
-#
-# # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
-# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
-# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
-# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
-#
-# # 设置代理
-PROXY_EXTRACT_API = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"  # 代理提取API ,返回的代理分割符为\r\n
+# 爬虫启动时,重新入库失败的item
+RETRY_FAILED_ITEMS = True
+
+# 保存失败的request
+SAVE_FAILED_REQUEST = False
+
+# request网络请求超时时间
+REQUEST_TIMEOUT = 60
+
+# 调度器,存放item与request的根目录
+REDIS_KEY = "fwork"
+
+# 设置代理,代理提取API ,返回的代理分割符为\r\n
+PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
 PROXY_ENABLE = True
 PROXY_ENABLE = True
-#
-# # 随机headers
-# RANDOM_HEADERS = True
-# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
-# USER_AGENT_TYPE = "chrome"
-# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
-# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-# # requests 使用session
-# USE_SESSION = False
-#
-# # 去重
-# ITEM_FILTER_ENABLE = False  # item 去重
-# REQUEST_FILTER_ENABLE = False  # request 去重
-# ITEM_FILTER_SETTING = dict(
-#     filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
-# )
-# REQUEST_FILTER_ENABLE = True  # request 去重
-# REQUEST_FILTER_SETTING = dict(
-#     filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
-#     expire_time=2592000,  # 过期时间1个月
-# )
-#
-# # 报警 支持钉钉、企业微信、邮件
-# # 钉钉报警
-# DINGDING_WARNING_URL = ""  # 钉钉机器人api
-# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
-# DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False
-# # 邮件报警
-# EMAIL_SENDER = ""  # 发件人
-# EMAIL_PASSWORD = ""  # 授权码
-# EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
-# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
-# # 企业微信报警
-# WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=89f0b1e9-8d08-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
+
+# item去重
+ITEM_FILTER_ENABLE = True  # item 去重
+ITEM_FILTER_SETTING = dict(
+    filter_type=5,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4、集群去重(SwordFishFilter)= 5
+    expire_time=63072000,  # 过期时间2年
+    redis_url=["172.17.4.239:2479", "172.17.4.240:2579", "172.17.4.84:2379"],  # 集群节点
+)
+
+# 企业微信报警
 WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
 WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
-WECHAT_WARNING_PHONE = "马国鹏"  # 报警人 将会在群内@此人, 支持列表,可指定多人
+WECHAT_WARNING_PHONE = "swordFish"  # 报警人 将会在群内@此人, 支持列表,可指定多人
 WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
 WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
-# # 时间间隔
+# 时间间隔
 WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
 WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
-WARNING_LEVEL = "INFO"  # 报警级别, DEBUG / ERROR
+WARNING_LEVEL = "ERROR"  # 报警级别, DEBUG / ERROR
 WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
 WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
-#
-#LOG_NAME = os.path.basename(os.getcwd())
 
 
-DTIME = time.strftime("%Y-%m-%d", time.localtime(time.time()))
-LOG_NAME = os.path.split(sys.argv[0])[-1].split('.')[0]
-LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
-LOG_LEVEL = "INFO"
+# 日志设置
+DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
+LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
+LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log存储路径
+LOG_LEVEL = "ERROR"
 LOG_COLOR = True  # 是否带有颜色
 LOG_COLOR = True  # 是否带有颜色
-LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
-# LOG_IS_WRITE_TO_FILE = True  # 是否写文件
-# LOG_MODE = "w"  # 写文件的模式
+LOG_IS_WRITE_TO_CONSOLE = True  # 是否打印到控制台
+LOG_IS_WRITE_TO_FILE = True  # 是否写文件
+LOG_MODE = "w"  # 写文件的模式
 LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
 LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
 LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_ENCODING = "utf8"  # 日志文件编码
 LOG_ENCODING = "utf8"  # 日志文件编码
 OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级 一般用不到
 OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级 一般用不到
-#
-# # 切换工作路径为当前项目路径
-# project_path = os.path.abspath(os.path.dirname(__file__))
-# os.chdir(project_path)  # 切换工作路经
-# sys.path.insert(0, project_path)
-# print('当前工作路径为 ' + os.getcwd())
-
-# 代理服务-未解析的
-jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
-
-headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
-
-# 文件存储功能的配置信息
-oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh',
-      'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
-# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
-
-author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
-
-# 线上代理服务的api地址
-JIANYU_PROXY_URL = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
-JIANYU_PROXY_AUTHOR = 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'
-
-# splash 渲染服务的api接口配置
-JIANYU_SPLASH_URL = 'http://8.131.72.226:8998/render.json'
-
-# 测试环境的redis集群 -- url去重专用
-REDISCLUSTER =  [
-                {"host": "192.168.3.207", "port": "2179"},
-                {"host": "192.168.3.166", "port": "2379"}
-            ]
-
-# 正式环境的redis集群 -- url去重专用
-# REDISCLUSTER =  [
-#                 {"host": "172.17.4.239", "port": "2479"},
-#                 {"host": "172.17.4.240", "port": "2579"},
-#                 {"host": "172.17.4.84", "port": "2379"}
-#             ]
+# elk服务
+LOG_IS_SEND_TO_LOGSTASH = False
+LOGSTASH_IP = "47.95.151.156"  # 已失效("47.95.151.156")
+LOGSTASH_PORT = 5044
+
+# 自建代理池
+SWORDFISH_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+
+# splash 渲染服务
+SWORDFISH_RENDER_URL = "http://59.110.6.43:8998/render.json"
+
+# 爬虫心跳
+HEARTBEAT_TABLE = "spider_heartbeat"  # 爬虫采集心跳记录表名
+
+# 远程bucket配置
+ALI_BUCKET_CONFIG = {
+    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
+    "bucket_name": "jy-datafile"
+}
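
For orientation, a tiny sketch of how the WEBDRIVER block is consumed: feapder merges this project setting.py into feapder.setting, and the webdriver wrapper earlier in this commit imports WEBDRIVER from there. Values in the comments are the ones configured in this file.

# Sketch: the renderer reads these values at import time.
from feapder.setting import WEBDRIVER

print(WEBDRIVER["server_addr"])          # "http://172.17.4.232:6666/wd/hub"
print(WEBDRIVER["driver_type"])          # "FIREFOX"
print(WEBDRIVER["usages_local_driver"])  # True -> local driver, False -> remote grid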

+ 13 - 30
FworkSpider/untils/WebCookiePool.py

@@ -1,19 +1,16 @@
-import json
 import sys
-import requests
-import re,execjs
-
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
-# from utils.cookie_pool import PageCookiePool
+
 from feapder.utils.webdriver import WebDriver
 from feapder.utils.webdriver import WebDriver
 from feapder.utils.log import log
 from feapder.utils.log import log
 from untils.cookie_pool import PageCookiePool
 from untils.cookie_pool import PageCookiePool
+import feapder.utils.tools as tools
+
 
 
 class WebCookiePool(PageCookiePool):
 class WebCookiePool(PageCookiePool):
-    def __init__(self, redis_key, page_url=None,cookie_key=None,
-                 min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs):
-        super(WebCookiePool, self).__init__(redis_key, page_url=None,
-                                           min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs)
+
+    def __init__(self, redis_key, page_url, cookie_key, **kwargs):
+        super(WebCookiePool, self).__init__(redis_key, **kwargs)
         self.page_url = page_url
         self.page_url = page_url
         self.cookie_key = cookie_key
         self.cookie_key = cookie_key
         self._kwargs = kwargs
         self._kwargs = kwargs
@@ -22,30 +19,16 @@ class WebCookiePool(PageCookiePool):
         self._kwargs.setdefault("driver_type", "FIREFOX")
         self._kwargs.setdefault("driver_type", "FIREFOX")
 
 
     def create_cookie(self):
     def create_cookie(self):
-        with WebDriver(**self._kwargs) as driver_pool:
-            import time
-            # time.sleep(1111)
+        with WebDriver(**self._kwargs) as browser:
             try:
             try:
-                # driver_pool = self.driver_pool.get()
-                driver_pool.get(self.page_url)
+                browser.get(self.page_url)
                 count = 0
                 count = 0
-                while self.cookie_key not in driver_pool.cookies.keys():
-                    time.sleep(1)
-                    count+=1
-                    if count>=30:
+                while self.cookie_key not in browser.cookies.keys():
+                    tools.delay_time(1)
+                    count += 1
+                    if count >= 30:
                         return
                         return
-                cookies = driver_pool.cookies
+                cookies = browser.cookies
                 return cookies
                 return cookies
             except Exception as e:
             except Exception as e:
                 log.error(f"获取cookie失败,{e}")
                 log.error(f"获取cookie失败,{e}")
-
-
-if __name__ == '__main__':
-    for i in range(10):
-        print(f'开始第{i+1}次获取cookie')
-        if i%3==0:
-            WebCookiePool(redis_key='gdcookie',cookie_key='SUB',page_url="https://weibo.com/p/1005051203448454/home?from=page_100505_profile&wvr=6&mod=data&is_all=1#place").create_cookie()
-        elif i%3==1:
-            WebCookiePool(redis_key='gd2cookie',cookie_key='locale',page_url="https://www.jianshu.com/p/4c5bc85fc3fd").create_cookie()
-        else:
-            WebCookiePool(redis_key='gd3cookie',cookie_key='cna',page_url="https://docs-next.crawlab.cn/zh/guide/installation/docker.html#%E5%A4%96%E9%83%A8-mongodb").create_cookie()
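
Usage stays close to the removed __main__ example; a hedged sketch with placeholder redis_key, page_url and cookie_key, plus one pass-through kwarg for the WebDriver.

# Sketch: waiting for a named cookie through the shared WebDriver wrapper.
from untils.WebCookiePool import WebCookiePool

pool = WebCookiePool(
    redis_key="demo_cookie",               # placeholder redis key
    page_url="https://example.com/login",  # placeholder page that sets the cookie
    cookie_key="SESSIONID",                # cookie name create_cookie() polls for (up to ~30s)
    headless=True,                         # extra kwargs are forwarded to WebDriver
)
cookies = pool.create_cookie()             # dict of cookies, or None if the key never shows up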

+ 13 - 2
FworkSpider/untils/__init__.py

@@ -1,7 +1,6 @@
 import oss2
 
 
-# from config.load import oss_conf
-from feapder.setting import oss_ as oss_conf
+from feapder.setting import ALI_BUCKET_CONFIG as oss_conf
 
 
 
 
 class AliYunService:
 class AliYunService:
@@ -22,3 +21,15 @@ class AliYunService:
         auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
         bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
         bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        流式上传oss
+
+        :param str key: 上传到OSS的文件名
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object(key, data)
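
Usage sketch (editorial): the new push_oss_from_stream uploads bytes, str or a file-like object directly with oss2.Bucket.put_object, so callers no longer need a local temp file. The key and payload below are placeholders.

from untils.aliyun import AliYunService

data = b"..."                          # bytes, str or file-like object
AliYunService().push_oss_from_stream(
    key="0a1b2c3d4e.pdf",              # object name in the bucket (placeholder)
    data=data,
)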

+ 212 - 169
FworkSpider/untils/attachment.py

@@ -1,20 +1,28 @@
 import hashlib
+import io
 import os
-import sys
 import traceback
 import uuid
-from urllib import request
+
 import requests
+import tqdm
 import urllib3
-from feapder.setting import headers
-from untils.execptions import AttachmentNullError
+
 from untils.aliyun import AliYunService
+from untils.execptions import AttachmentNullError
 from untils.proxy_pool import ProxyPool
-import time
-import tqdm
+
 urllib3.disable_warnings()
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+    'Accept': '*/*'
+}
+
+
 class AttachmentDownloader:
-    '''附件下载模块'''
+    """附件下载模块"""
+
     def __init__(self):
         self.dir_name = 'file'
 
@@ -22,92 +30,141 @@ class AttachmentDownloader:
         if not os.path.exists(self.dir_name):
             os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
 
-    def create_file_path(self, filename, file_type):
+    def create_file(self, filename, file_type):
         self.create_dir()
-        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
-        tmp_name = "{}.{}".format(sign, file_type)
-        return "{}/{}".format(self.dir_name, tmp_name)
+        sign = self._hash("{}_{}".format(filename, uuid.uuid4()))
+        local_file_name = "{}.{}".format(sign, file_type)
+        return "{}/{}".format(self.dir_name, local_file_name)
+
+    def create_fid(self, data: bytes):
+        return self._hash(data)
 
-    def hex_sha1(self,val):
-        sha1 = hashlib.sha1()
+    @staticmethod
+    def _hash(val):
+        _sha1 = hashlib.sha1()
         if isinstance(val, bytes):
-            sha1.update(str(val).encode("utf-8"))
+            _sha1.update(str(val).encode("utf-8"))
         elif isinstance(val, str):
-            sha1.update(val.encode("utf-8"))
-        res = sha1.hexdigest()
-        return res
+            _sha1.update(val.encode("utf-8"))
+        return _sha1.hexdigest()
 
     @staticmethod
-    def create_fid(file_stream: bytes):
-        sha1 = hashlib.sha1()
-        if isinstance(file_stream, bytes):
-            sha1.update(str(file_stream).encode("utf-8"))
-        elif isinstance(file_stream, str):
-            sha1.update(file_stream.encode("utf-8"))
-        res = sha1.hexdigest()
-        return res
+    def clean_attachment(file_path):
+        """
+        删除文件
 
 
+        :param str file_path: 文件路径
+        """
+        try:
+            os.remove(file_path)
+        except FileNotFoundError:
+            pass
+
+    def remove(self, file):
+        self.clean_attachment(file)
 
 
     @staticmethod
     @staticmethod
-    def clean_attachment(file_path):
-        os.remove(file_path)
+    def get_mb(data):
+        """
+        获取数据的Mb
+
+        :param int data: 准备计算大小的内容
+        :return: float
+        """
+        _kb = float(data / 1024.0)
+        return float(_kb / 1024.0)
 
 
     @staticmethod
     @staticmethod
-    def getsize(file_path: str):
-        def _getsize(filename):
+    def getsize(data):
+        """
+        计算数据大小
+
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        :return str
+        """
+        size = 0
+        if isinstance(data, str):
             try:
             try:
-                return os.path.getsize(filename)
-            except:
-                return 0
+                size = os.path.getsize(data)
+            except FileNotFoundError:
+                pass
+        elif isinstance(data, bytes):
+            size = len(data)
+        else:
+            pass
 
 
-        _kb = float(_getsize(file_path)) / 1024
+        _kb = float(size) / 1024
+        result = "{:.1f} kb".format(_kb)
         if _kb >= 1024:
         if _kb >= 1024:
             _M = _kb / 1024
             if _M >= 1024:
                 _G = _M / 1024
+                result = "{:.1f} G".format(_G)
             else:
             else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
+                result = "{:.1f} M".format(_M)
+        return result
 
 
-    @staticmethod
-    def _fetch_attachment(
-            url: str,
-            file_path: str,
-            enable_proxy=False,
-            allow_show_exception=False,
-            **kwargs
-    ):
+    def fetch_data(self, url, file=None, **kwargs):
+        """
+        数据下载
+
+        :param str url: 下载地址
+        :param file: 本地文件
+        :param dict kwargs: requests请求参数
+        :return:
+        """
+        enable_proxy = kwargs.pop('enable_proxy', False)
+        allow_show_exception = kwargs.pop('allow_show_exception', False)
+        method = kwargs.pop('method', 'get')
         request_params = {}
+        request_params.setdefault('data', kwargs.pop('data', None))
+        request_params.setdefault('cookies', kwargs.pop('cookies', None))
         request_params.setdefault('headers', kwargs.get('headers') or headers)
         request_params.setdefault('proxies', kwargs.get('proxies'))
-        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
-        # request_params.setdefault('stream', kwargs.get('stream') or True)
-        request_params.setdefault('verify', kwargs.get('verify') or False)
-        if enable_proxy:
-            proxy = ProxyPool().get()
-        else:
-            proxy = {}
+        request_params.setdefault('timeout', kwargs.pop('timeout', 60))
+        request_params.setdefault('stream', kwargs.pop('stream', True))
+        request_params.setdefault('verify', kwargs.pop('verify', False))
+        request_params.setdefault('allow_redirects', kwargs.pop('allow_redirects', True))
+
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url,stream=True, **request_params) as req:
-                    content_size = req.headers.get('Content-Length') or 0
-                    content_size = int(content_size)
-                    stream = b''
-                    if req.status_code == 200:
-                        with open(file_path, 'wb') as f:
-                            with tqdm.tqdm(total=content_size, unit='B', initial=0, unit_scale=True, unit_divisor=1024,
-                                      ascii=True,desc=file_path) as bar:
-                                for chunk in req.iter_content(chunk_size=1024*20):
-                                    if chunk:
-                                        f.write(chunk)
-                                    stream += chunk
-                                    bar.update(len(chunk))
-                        return stream
-                    else:
+                with requests.request(method, url, **request_params) as req:
+                    stream = io.BytesIO()
+                    lh = {k.lower(): v for k, v in req.headers.items()}
+                    '''内容长度'''
+                    cl = lh.get('content-length') or len(req.content)
+                    icl = int(cl)
+                    content_length = self.get_mb(icl)
+                    if content_length > 50:
+                        '''丢弃超过50Mb内容长度的文件'''
+                        return stream.getvalue()
+
+                    if req.status_code != 200:
                         retries += 1
+                        continue
+
+                    iter_content = req.iter_content(chunk_size=1024 * 20)
+                    with tqdm.tqdm(
+                            total=icl,
+                            unit='B',
+                            initial=0,
+                            unit_scale=True,
+                            unit_divisor=1024,  # 1M=1024Kb,单位换算
+                            ascii=True,
+                            desc=file) as bar:
+                        if file is not None:
+                            with open(file, 'wb') as f:
+                                for chunk in iter_content:
+                                    stream.write(chunk)
+                                    size = f.write(chunk)
+                                    bar.update(size)
+                        else:
+                            for chunk in iter_content:
+                                size = stream.write(chunk)
+                                bar.update(size)
+                    return stream.getvalue()
             except requests.RequestException:
                 if allow_show_exception:
                     traceback.print_exc()
@@ -116,129 +173,115 @@ class AttachmentDownloader:
                 retries += 1
         return b''
 
-    def fetch_attachment(
-            self,
-            file_name: str,
-            file_type: str,
-            download_url: str,
-            enable_proxy=False,
-            allow_request_exception=False,
-            **kwargs
-    ):
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
-        file_path = self.create_file_path(file_name, file_type)
-        file_stream = self._fetch_attachment(
-            download_url,
-            file_path,
-            enable_proxy,
-            allow_request_exception,
-            **kwargs
-        )
-        # file_stream = self.download_file(download_url,file_path,enable_proxy,allow_request_exception)
-        if len(file_stream) > 0:
-            fid = self.create_fid(file_stream)
-            '''上传/下载,无论失败成功都需要给出文件基础信息'''
+    def _push_oss_from_stream(self, file_name, file_type, url, **kw):
+        """
+        将数据流推送oss
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str url: 下载地址
+        :param dict kw: 额外下载信息
+        :return: dict: 附件信息
+        """
+        stream = self.fetch_data(url, None, **kw)
+        if len(stream) > 0:
+            fid = self.create_fid(stream)
             try:
             try:
                 result = {
                     'filename': file_name,
                     'ftype': file_type,
                     'fid': "{}.{}".format(fid, file_type),
-                    'size': self.getsize(file_path),
+                    'org_url': url,
+                    'size': self.getsize(stream),
                     'url': 'oss',
                     'url': 'oss',
                 }
+                AliYunService().push_oss_from_stream(result['fid'], stream)
             except Exception:
             except Exception:
                 result = {
                     'filename': file_name,
+                    'org_url': url,
                 }
                 }
-            self.clean_attachment(file_path)
         else:
         else:
             result = {
                 'filename': file_name,
+                'org_url': url,
             }
             }
         return result
 
+    def _push_oss_from_file(self, file_name, file_type, url, **kw):
         """
         """
-        Args:
-            url: 地址
-            file_path: 文件存储地址
-            call_func: 下载成功的回调
-        Returns:
+        将本地文件推送oss
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str url: 下载地址
+        :param dict kw: 额外下载信息
+        :return: dict: 附件信息
         """
         """
-        # proxies = kwargs.get('proxies') or None
-        # data = kwargs.get('data') or None
-        start_time = time.time()
-        def progress_callfunc(blocknum, blocksize, totalsize):
-            """回调函数
-            @blocknum : 已经下载的数据块
-            @blocksize : 数据块的大小
-            @totalsize: 远程文件的大小
-            """
-            speed = (blocknum * blocksize) / (time.time() - start_time)
-            # speed_str = " Speed: %.2f" % speed
-            speed_str = " Speed: %s" % format_size(speed)
-            recv_size = blocknum * blocksize
-
-            # 设置下载进度条
-            f = sys.stdout
-            pervent = recv_size / totalsize
-            percent_str = "%.2f%%" % (pervent * 100)
-            n = round(pervent * 50)
-            s = ('#' * n).ljust(50, '-')
-            f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
-            f.flush()
-            f.write('\r')
-
-        def format_size(bytes):
+        file = self.create_file(file_name, file_type)
+        stream = self.fetch_data(url, file, **kw)
+        '''上传/下载,无论失败成功都需要返回文件基础信息'''
+        if len(stream) > 0:
+            fid = self.create_fid(stream)
             try:
             try:
-                bytes = float(bytes)
-                kb = bytes / 1024
-            except:
-                print("传入的字节格式不对")
-                return "Error"
-            if kb >= 1024:
-                M = kb / 1024
-                if M >= 1024:
-                    G = M / 1024
-                    return "%.3fG" % (G)
-                else:
-                    return "%.3fM" % (M)
-            else:
-                return "%.3fK" % (kb)
+                result = {
+                    'filename': file_name,
+                    'ftype': file_type,
+                    'fid': "{}.{}".format(fid, file_type),
+                    'org_url': url,
+                    'size': self.getsize(file),
+                    'url': 'oss',
+                }
+                AliYunService().push_oss_from_local(result['fid'], file)
+            except Exception:
+                result = {
+                    'filename': file_name,
+                    'org_url': url,
+                }
+        else:
+            result = {
+                'filename': file_name,
+                'org_url': url,
+            }
+        '''删除本地临时文件'''
+        self.remove(file)
+        return result
 
-        if url:
-            try:
-                if enable_proxy:
-                    proxies = ProxyPool().get()
-                    # create the object, assign it to a variable
-                    proxy = request.ProxyHandler(proxies)
-                    # construct a new opener using your proxy settings
-                    opener = request.build_opener(proxy)
-                    # install the openen on the module-level
-                    request.install_opener(opener)
-                # 测试可以打开进度条,生产环境禁用进度条
-                filename, headers = request.urlretrieve(url, file_path, progress_callfunc, data)
-                # filename, headers = request.urlretrieve(url, file_path, data)
-                print(filename,headers)
-
-                if callable(call_func):
-                    call_func()
-                return filename
-            except Exception as e:
-                print(e)
-                return ''
+    def _fetch_attachment(self, file_name, file_type, download_url, **kwargs):
+        """
+        下载附件
+
+        :param str file_name: 文件名称
+        :param str file_type: 文件类型
+        :param str download_url: 下载地址
+        :param dict kwargs: 额外的附件下载配置
+        :return: dict: 附件
+        """
+        mode = kwargs.pop('mode', 'local')
+        if mode == "stream":
+            res = self._push_oss_from_stream(
+                file_name,
+                file_type,
+                download_url,
+                **kwargs
+            )
         else:
         else:
-            return ''
+            res = self._push_oss_from_file(
+                file_name,
+                file_type,
+                download_url,
+                **kwargs
+            )
+        return res
 
-if __name__ == '__main__':
+    def fetch_attachment(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            **kw
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
 
 
-    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
-    attachment = AttachmentDownloader().fetch_attachment(
-        file_name="file_name", file_type="pdf", download_url=url,
-        enable_proxy=False)
-    print(attachment)
+        return self._fetch_attachment(file_name, file_type, download_url, **kw)
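
Usage sketch (editorial): fetch_attachment now routes through _fetch_attachment, where mode='stream' uploads straight from memory via push_oss_from_stream and any other mode falls back to the temp-file path; in this version enable_proxy is simply popped inside fetch_data. The URL and names below are placeholders.

from untils.attachment import AttachmentDownloader

attachment = AttachmentDownloader().fetch_attachment(
    file_name="notice",                                 # placeholder display name
    file_type="pdf",
    download_url="https://example.com/files/demo.pdf",  # placeholder URL
    mode="stream",                                      # skip the local temp file
)
print(attachment)  # dict with filename/ftype/fid/org_url/size/url on success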

+ 0 - 61
FworkSpider/untils/chaojiying.py

@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# coding:utf-8
-
-import requests
-from hashlib import md5
-
-class Chaojiying_Client(object):
-
-    def __init__(self, username, password, soft_id):
-        self.username = username
-        password =  password.encode('utf8')
-        self.password = md5(password).hexdigest()
-        self.soft_id = soft_id
-        self.base_params = {
-            'user': self.username,
-            'pass2': self.password,
-            'softid': self.soft_id,
-        }
-        self.headers = {
-            'Connection': 'Keep-Alive',
-            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
-        }
-
-    def PostPic(self, im, codetype):
-        """
-        im: 图片字节
-        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
-        """
-        params = {
-            'codetype': codetype,
-        }
-        params.update(self.base_params)
-        files = {'userfile': ('ccc.jpg', im)}
-        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
-        return r.json()
-
-    def ReportError(self, im_id):
-        """
-        im_id:报错题目的图片ID
-        """
-        params = {
-            'id': im_id,
-        }
-        params.update(self.base_params)
-        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
-        return r.json()
-
-
-if __name__ == '__main__':
-    # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')	#用户中心>>软件ID 生成一个替换 96001
-    # im = open('a.jpg', 'rb').read()													#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
-    # # print(chaojiying.PostPic(im, 1902))
-    # res = chaojiying.PostPic(im, 2004)
-    # print(res)
-    # if res.get("err_no") != 0:
-    #     chaojiying.ReportError(res.get("pic_id"))
-    # if res.get("")
-    code = "haoho"
-    url = 'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode=胡吃海喝'[:-4]+code
-
-    print(url)

+ 0 - 0
FworkSpider/untils/clean_html/__init__.py


+ 0 - 131
FworkSpider/untils/clean_html/defaults.py

@@ -1,131 +0,0 @@
-import re
-
-__all__ = ['cleaner']
-
-# 独立元素
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
-    '\\xa0|\\u3000': '',  # 空格
-    '<!--[\s\S]*?-->': '',  # 注释
-    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # 输入框
-    '<img[^>]*>': '<br>',  # 图片
-}
-# 行内元素
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # 超链接
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-}
-# 块级元素
-BLOCK_TAGS = {
-    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
-    '<p>|<p [^>]*>|</p>': '<br>',  # 段落
-    '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
-}
-# 其他
-OTHER = {
-    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-}
-# 样式
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-}
-# 空白符
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# css标签集合
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# css属性集合
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写)"""
-    tags = re.findall("<[^>]+>", html)
-    for tag in tags:
-        html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    数据清洗
-
-    :param html: 清洗的页面
-    :param special: 额外指定页面清洗规则
-    :param completely: 是否完全清洗页面
-    :return: 清洗后的页面源码
-    """
-    if special is None:
-        special = {}
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html

+ 0 - 136
FworkSpider/untils/cleaner.py

@@ -1,136 +0,0 @@
-import re
-__all__ = ['cleaner']
-
-# 独立元素
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
-    '\\xa0|\\u3000': '',  # 空格
-    '<!--[\s\S]*?-->': '',  # 注释
-    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # 输入框
-    '<img[^>]*>': '<br>',  # 图片
-}
-# 行内元素
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # 超链接
-    '<link>|<link [^>]*>|</link>': '',  # 超链接
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-}
-# 块级元素
-BLOCK_TAGS = {
-    '<div>\s*?</div>':'',
-    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
-    '<p>|<p [^>]*>': '<br>',  # 段落
-    '</p>': '',  # 段落
-    '<div>|<div [^>]*>': '<br>',  # 分割 division
-    '</div>': '',  # 分割 division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
-}
-# 其他
-OTHER = {
-    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-
-}
-# 样式
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-
-}
-# 空白符
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# css标签集合
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# css属性集合
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写)"""
-    tags = re.findall("<[^>]+>", html)
-    for tag in tags:
-        html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    数据清洗
-
-    :param html: 清洗的页面
-    :param special: 额外指定页面清洗规则
-    :param completely: 是否完全清洗页面
-    :return: 清洗后的页面源码
-    """
-    if special is None:
-        special = {}
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html

+ 62 - 654
FworkSpider/untils/cookie_pool.py

@@ -1,227 +1,50 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/27 11:32 AM
----------
-@summary: cookie池
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import abc
-import datetime
-import random
 import time
-import warnings
 from collections import Iterable
-from enum import Enum, unique
-import requests
-from feapder.db.mongodb import MongoDB
 
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.network import user_agent
+from func_timeout import func_set_timeout
 
-from feapder.db.mysqldb import MysqlDB
+import feapder.utils.tools as tools
+from feapder.db.mongodb import MongoDB
 from feapder.db.redisdb import RedisDB
-from feapder.utils import metrics
+from feapder.network.cookie_pool import (
+    CookiePoolInterface,
+    PageCookiePool,
+    User,
+)
 from feapder.utils.log import log
 from feapder.utils.redis_lock import RedisLock
-from feapder.utils.tools import send_msg
-from feapder.utils.webdriver import WebDriver
-
+from feapder.utils.tools import get_current_date
 
-class CookiePoolInterface(metaclass=abc.ABCMeta):
-    """
-    cookie pool interface
-    """
-
-    @abc.abstractmethod
-    def create_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def get_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def del_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def run(self):
-        raise NotImplementedError
-
-
-class PageCookiePool(CookiePoolInterface):
-    """
-    由页面产生的cookie 不需要用户登陆
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        page_url=None,
-        min_cookies=10000,
-        must_contained_keys=(),
-        keep_alive=False,
-        **kwargs,
-    ):
-        """
-        @param redis_key: 项目名
-        @param page_url: 生产cookie的url
-        @param min_cookies: 最小cookie数
-        @param must_contained_keys: cookie 必须包含的key
-        @param keep_alive: 当cookie数量足够是是否保持随时待命,生产cookie的状态。False为否,满足则退出
-        ---
-        @param kwargs: WebDriver的一些参数
-            load_images: 是否加载图片
-            user_agent_pool: user-agent池 为None时不使用
-            proxies_pool: ;代理池 为None时不使用
-            headless: 是否启用无头模式
-            driver_type: web driver 类型
-            timeout: 请求超时时间 默认16s
-            window_size: 屏幕分辨率 (width, height)
-
-        """
-
-        self._redisdb = RedisDB()
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
-            redis_key
-        )  # 存储上一次统计cookie 数量的时间,格式为 时间戳:数量
-        self._page_url = page_url
-        self._min_cookies = min_cookies
-        self._must_contained_keys = must_contained_keys
-        self._keep_alive = keep_alive
-
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-
-    def create_cookie(self):
-        """
-        可能会重写
-        @return:
-        """
-        url = self._page_url
-        header = {
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": user_agent.get()
-        }
-        res = requests.get(url, headers=header)
-        cookies = requests.utils.dict_from_cookiejar(res.cookies)
-        return cookies
-
-
-    def add_cookies(self, cookies):
-        log.info("添加cookie {}".format(cookies))
-        self._redisdb.lpush(self._tab_cookie_pool, cookies)
-    def run(self):
-        while True:
-            try:
-                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
-                need_cookie_count = self._min_cookies - now_cookie_count
-
-                if need_cookie_count > 0:
-                    log.info(
-                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
-                            now_cookie_count, self._min_cookies
-                        )
-                    )
-                    try:
-                        print('????')
-                        cookies = self.create_cookie()
-                        if cookies:
-                            self.add_cookies(cookies)
-                    except Exception as e:
-                        log.exception(e)
-                else:
-                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-
-                    # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
-                    last_count_info = self._redisdb.strget(
-                        self._tab_cookie_pool_last_count
-                    )
-                    if not last_count_info:
-                        self._redisdb.strset(
-                            self._tab_cookie_pool_last_count,
-                            "{}:{}".format(time.time(), now_cookie_count),
-                        )
-                    else:
-                        last_time, last_count = last_count_info.split(":")
-                        last_time = float(last_time)
-                        last_count = int(last_count)
-
-                        if time.time() - last_time > 60:
-                            if now_cookie_count == last_count:
-                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
-                                break
-                            else:
-                                self._redisdb.strset(
-                                    self._tab_cookie_pool_last_count,
-                                    "{}:{}".format(time.time(), now_cookie_count),
-                                )
-
-                    if self._keep_alive:
-                        log.info("sleep 10")
-                        tools.delay_time(10)
-                    else:
-                        break
-
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def get_cookie(self, wait_when_null=True):
-        while True:
-            try:
-                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中...")
-                    self._keep_alive = False
-                    self._min_cookies = 1
-                    with RedisLock(
-                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
-                    ) as _lock:
-                        if _lock.locked:
-                            self.run()
-                    continue
-                return eval(cookie_info) if cookie_info else {}
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, cookies):
-        self._redisdb.lrem(self._tab_cookie_pool, cookies)
-
-# PageCookiePool('cookie_1',page_url="https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do").create_cookie()
-class User:
-    def __init__(self, username, cookie):
-        self.username = username
-        self.cookie = cookie
+__all__ = [
+    "PageCookiePool",
+    "User",
+    "LoginCookiePool"
+]
 
 
 class LoginCookiePool(CookiePoolInterface):
     """
-    需要登陆的cookie池, 用户账号密码等信息用mysql保存
+    需要登陆的cookie池, 用户账号密码等信息用mongoDB保存
     """
 
     def __init__(
-        self,
-        redis_key,
-        *,
-        table_userbase,
-        login_state_key="login_state",
-        lock_state_key="lock_state",
-        username_key="username",
-        password_key="password",
-        login_retry_times=10,
+            self,
+            redis_key,
+            *,
+            login_site,
+            table_userbase="feapder_login",
+            table_login_record="feapder_login_record",
+            login_state_key="login_state",
+            lock_state_key="lock_state",
+            username_key="username",
+            password_key="password",
+            login_retry_times=10,
     ):
     ):
         """
         @param redis_key: 项目名
         @param table_userbase: 用户表名
         @param table_userbase: 用户表名
+        @param table_login_record: 用户登录状态表名
         @param login_state_key: 登录状态列名
         @param login_state_key: 登录状态列名
         @param lock_state_key: 封锁状态列名
         @param username_key: 登陆名列名
         self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
         self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
         self._login_retry_times = login_retry_times
         self._table_userbase = table_userbase
         self._login_state_key = login_state_key
         self._login_state_key = login_state_key
         self._lock_state_key = lock_state_key
         self._username_key = username_key
         self._password_key = password_key
+        self._login_site = login_site
         self._redisdb = RedisDB()
         self._redisdb = RedisDB()
         self._mongo = MongoDB(db='user_login')
 
     def create_cookie(self, username, password):
     def create_cookie(self, username, password):
 
         """
         @return: yield username, password
         @return: yield username, password
         """
 
+        query = {
+            "site": self._login_site,
+            self._lock_state_key: 0,
+            self._login_state_key: 0
+        }
+        return self._mongo.find(self._table_userbase, query)
 
 
     def handle_login_failed_user(self, username, password):
     def handle_login_failed_user(self, username, password):
         """
         """
@@ -279,14 +107,19 @@ class LoginCookiePool(CookiePoolInterface):
 
 
     def save_cookie(self, username, cookie):
     def save_cookie(self, username, cookie):
         user_cookie = {"username": username, "cookie": cookie}
         user_cookie = {"username": username, "cookie": cookie}
-
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
         self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
         self._mongo.add(
-                data={self._login_state_key:1},
-                update_columns=self._username_key,
-                update_columns_value=username)
+            coll_name=self._table_login_record,
+            data={self._login_state_key: 1,
+                  "status": "create",
+                  "site": self._login_site,
+                  "login_time": time.strftime("%Y-%m-%d %H:%M:%S",
+                                              time.localtime(
+                                                  int(round(time.time()))))},
+            update_columns=self._username_key,
+            update_columns_value=username)
 
 
+    @func_set_timeout(60)
     def get_cookie(self, wait_when_null=True) -> User:
     def get_cookie(self, wait_when_null=True) -> User:
         while True:
             try:
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
         self._redisdb.lrem(self._tab_cookie_pool, user_info)
 
         self._mongo.add(
-            data={self._login_state_key: 1},
+            coll_name=self._table_login_record,
+            data={
+                self._login_state_key: 1,
+                "status": "remove",
+                "site": self._login_site,
+                "login_time": get_current_date()
+            },
             update_columns=self._username_key,
             update_columns=self._username_key,
             update_columns_value=user.username)
 
     def user_is_locked(self, user: User):
         self._mongo.add(
         self._mongo.add(
-            coll_name=self._table_userbase,
-            data={self._lock_state_key: 1},
+            coll_name=self._table_login_record,
+            data={
+                self._lock_state_key: 1,
+                "site": self._login_site,
+                "login_time": get_current_date()
+            },
             update_columns=self._username_key,
             update_columns=self._username_key,
             update_columns_value=user.username)
 
     def run(self):
         with RedisLock(
+                key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
         ) as _lock:
         ) as _lock:
             if _lock.locked:
                 user_infos = self.get_user_info()
                             if cookie:
                             if cookie:
                                 self.save_cookie(username, cookie)
                             else:
+                                self.handle_login_failed_user(username,
+                                                              password)
 
 
                             break
                             break
                         except Exception as e:
                         except Exception as e:
@@ -359,437 +201,3 @@ class LoginCookiePool(CookiePoolInterface):
                         self.handle_login_failed_user(username, password)
                         self.handle_login_failed_user(username, password)
 
     login = run
-
-@unique
-class LimitTimesUserStatus(Enum):
-    # 使用状态
-    USED = "used"
-    SUCCESS = "success"
-    OVERDUE = "overdue"  # cookie 过期
-    SLEEP = "sleep"
-    EXCEPTION = "exception"
-    # 登陆状态
-    LOGIN_SUCCESS = "login_success"
-    LOGIN_FALIED = "login_failed"
-
-
-class LimitTimesUser:
-    """
-    有次数限制的账户
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
-    SITE_NAME = ""  # 网站名
-
-    redisdb = None
-
-    def __init__(
-        self,
-        username,
-        password,
-        max_search_times,
-        proxies=None,
-        search_interval=0,
-        **kwargs,
-    ):
-        """
-        @param username:
-        @param password:
-        @param max_search_times:
-        @param proxies:
-        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如(5,10)即5到10秒;或直接传整数
-        """
-        self.__dict__.update(kwargs)
-        self.username = username
-        self.password = password
-        self.max_search_times = max_search_times
-        self.proxies = proxies
-        self.search_interval = search_interval
-        self.delay_use = 0  # 延时使用,用于等待解封的用户
-
-        if isinstance(search_interval, (tuple, list)):
-            if len(search_interval) != 2:
-                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
-
-            self.used_for_time_length = (
-                search_interval[1] * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-        else:
-            self.used_for_time_length = (
-                search_interval * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-
-        self.account_info = {
-            "login_time": 0,
-            "cookies": {},
-            "search_times": 0,
-            "last_search_time": 0,
-            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
-            "init_search_times_time": 0,  # 初始化搜索次数的时间
-        }
-
-        if not self.__class__.redisdb:
-            self.__class__.redisdb = RedisDB()
-
-        self.sync_account_info_from_redis()
-
-        self.__init_metrics()
-
-    def __init_metrics(self):
-        """
-        初始化打点系统
-        @return:
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def record_user_status(self, status: LimitTimesUserStatus):
-        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
-
-    def __repr__(self):
-        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
-
-    def __eq__(self, other):
-        return self.username == other.username
-
-    def sync_account_info_from_redis(self):
-        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
-        if account_info:
-            account_info = eval(account_info)
-            self.account_info.update(account_info)
-
-    @property
-    def cookies(self):
-        cookies = self.account_info.get("cookies")
-        return cookies
-
-    def set_cookies(self, cookies):
-        self.account_info["cookies"] = cookies
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def set_login_time(self, login_time=None):
-        self.account_info["login_time"] = login_time or time.time()
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def get_login_time(self):
-        return self.account_info.get("login_time")
-
-    def is_time_to_login(self):
-        return time.time() - self.get_login_time() > 40 * 60
-
-    def get_last_search_time(self):
-        return self.account_info.get("last_search_time", 0)
-
-    def is_time_to_search(self):
-        if self.delay_use:
-            is_time = time.time() - self.get_last_search_time() > self.delay_use
-            if is_time:
-                self.delay_use = 0
-
-        else:
-            is_time = time.time() - self.get_last_search_time() > (
-                random.randint(*self.search_interval)
-                if isinstance(self.search_interval, (tuple, list))
-                else self.search_interval
-            )
-
-        return is_time
-
-    @property
-    def used_for_spider_name(self):
-        return self.account_info.get("used_for_spider_name")
-
-    @used_for_spider_name.setter
-    def used_for_spider_name(self, spider_name):
-        self.account_info["used_for_spider_name"] = spider_name
-
-    def update_status(self):
-        """
-        更新search的一些状态
-        @return:
-        """
-        self.account_info["search_times"] += 1
-        self.account_info["last_search_time"] = time.time()
-
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    @property
-    def search_times(self):
-        init_search_times_time = self.account_info.get("init_search_times_time")
-        current_time = time.time()
-        if (
-            current_time - init_search_times_time >= 86400
-        ):  # 如果距离上次初始化搜索次数时间大于1天,则搜索次数清清零
-            self.account_info["search_times"] = 0
-            self.account_info["init_search_times_time"] = current_time
-
-            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
-
-        return self.account_info["search_times"]
-
-    def is_overwork(self):
-        if self.search_times > self.max_search_times:
-            log.warning("账号 {} 请求次数超限制".format(self.username))
-            return True
-
-        return False
-
-    def is_at_work_time(self):
-        if datetime.datetime.now().hour in list(range(7, 23)):
-            return True
-
-        log.warning("账号 {} 不再工作时间内".format(self.username))
-        return False
-
-    def del_cookie(self):
-        self.account_info["cookies"] = {}
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def create_cookie(self):
-        """
-        生产cookie 有异常需要抛出
-        @return: cookie_dict
-        """
-
-        raise NotImplementedError
-
-    def login(self):
-        """
-        @return: 1 成功 0 失败
-        """
-
-        try:
-            # 预检查
-            if not self.is_time_to_login():
-                log.info("此账号尚未到登陆时间: {}".format(self.username))
-                time.sleep(5)
-                return 0
-
-            cookies = self.create_cookie()
-            if not cookies:
-                raise Exception("登陆失败 未获取到合法cookie")
-
-            if not isinstance(cookies, dict):
-                raise Exception("cookie 必须为字典格式")
-
-            # 保存cookie
-            self.set_login_time()
-            self.set_cookies(cookies)
-            log.info("登录成功 {}".format(self.username))
-            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
-            return 1
-
-        except Exception as e:
-            log.exception(e)
-            send_msg(
-                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
-                level="error",
-                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
-            )
-
-        log.info("登录失败 {}".format(self.username))
-        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
-        return 0
-
-
-class LimitTimesUserPool:
-    """
-    限制查询次数的用户的User pool
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    LOAD_USER_INTERVAL = 60
-
-    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
-        """
-        @param accounts_dic: 账户信息字典
-            {
-                "15011300228": {
-                    "password": "300228",
-                    "proxies": {},
-                    "max_search_times": 500,
-                    "search_interval": 1, # 使用时间间隔
-                    # 其他携带信息
-                }
-            }
-        @param limit_user_class: 用户重写的 limit_user_class
-        @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
-        """
-        self.accounts_dict = accounts_dict
-        self.limit_user_class = limit_user_class
-
-        self.limit_times_users = []
-        self.current_user_index = -1
-
-        self.support_more_client = support_more_client
-
-        self.last_load_user_time = 0
-
-    def __load_users(self, username=None):
-        # 装载user
-        log.info("更新可用用户")
-
-        for _username, detail in self.accounts_dict.items():
-            if username and username != _username:
-                continue
-
-            limit_times_users = self.limit_user_class(username=_username, **detail)
-            if limit_times_users in self.limit_times_users:
-                continue
-
-            if limit_times_users.is_overwork():
-                continue
-            else:
-                if (
-                    limit_times_users.cookies or limit_times_users.login()
-                ):  # 如果有cookie 或者登陆成功 则添加到可用的user队列
-                    self.limit_times_users.append(limit_times_users)
-
-        self.last_load_user_time = time.time()
-
-    def get_user(
-        self,
-        username=None,
-        used_for_spider_name=None,
-        wait_when_null=True,
-        not_limit_frequence=False,
-    ) -> LimitTimesUser:
-        """
-        @params username: 获取指定的用户
-        @params used_for_spider_name: 独享式使用,独享爬虫的名字。其他爬虫不可抢占
-        @params wait_when_null: 无用户时是否等待
-        @params not_limit_frequence: 不限制使用频率
-        @return: LimitTimesUser
-        """
-        if not self.support_more_client:
-            warnings.warn(
-                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
-                category=Warning,
-            )
-            self._is_show_warning = True
-
-        while True:
-            if (
-                not self.limit_times_users
-                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
-            ):
-                self.__load_users(username)
-                if not self.limit_times_users:
-                    log.warning("无可用的用户")
-                    if wait_when_null:
-                        time.sleep(1)
-                        continue
-                    else:
-                        return None
-
-            self.current_user_index += 1
-            self.current_user_index = self.current_user_index % len(
-                self.limit_times_users
-            )
-
-            limit_times_user = self.limit_times_users[self.current_user_index]
-            if self.support_more_client:  # 需要先同步下最新数据
-                limit_times_user.sync_account_info_from_redis()
-
-            if username and limit_times_user.username != username:
-                log.info(
-                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
-                )
-                time.sleep(1)
-                continue
-
-            # 独占式使用,若为其他爬虫,检查等待使用时间是否超过独占时间,若超过则可以使用
-            if (
-                limit_times_user.used_for_spider_name
-                and limit_times_user.used_for_spider_name != used_for_spider_name
-            ):
-                wait_time = time.time() - limit_times_user.get_last_search_time()
-                if wait_time < limit_times_user.used_for_time_length:
-                    log.info(
-                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
-                            limit_times_user.username,
-                            limit_times_user.used_for_spider_name,
-                            limit_times_user.used_for_time_length - wait_time,
-                        )
-                    )
-                    time.sleep(1)
-                    continue
-
-            if (
-                not limit_times_user.is_overwork()
-                and limit_times_user.is_at_work_time()
-            ):
-                if not limit_times_user.cookies:
-                    self.limit_times_users.remove(limit_times_user)
-                    continue
-
-                if not_limit_frequence or limit_times_user.is_time_to_search():
-                    limit_times_user.used_for_spider_name = used_for_spider_name
-
-                    limit_times_user.update_status()
-                    log.info("使用用户 {}".format(limit_times_user.username))
-                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
-                    return limit_times_user
-                else:
-                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
-                    time.sleep(1)
-                    continue
-            else:
-                self.limit_times_users.remove(limit_times_user)
-                self.current_user_index -= 1
-
-                if not limit_times_user.is_at_work_time():
-                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
-                    if wait_when_null:
-                        time.sleep(30)
-                        continue
-                    else:
-                        return None
-
-    def del_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.del_cookie()
-                self.limit_times_users.remove(limit_times_user)
-                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
-                self.__load_users(username)
-                break
-
-    def update_cookies(self, username, cookies):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.set_cookies(cookies)
-                break
-
-    def delay_use(self, username, delay_seconds):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.delay_use = delay_seconds
-                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
-                break
-
-    def record_success_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
-
-    def record_exception_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
-
-# if __name__ == '__main__':
-#     cookiepool = PageCookiePool(redis_key='fwork:gszfcg',
-#                                 page_url='http://www.ccgp-hubei.gov.cn/notice/cgyxgg/index_1.html',
-#                                 driver_type='FIREFOX',
-#                                 executable_path="D:\\geckodriver.exe")
-#     cookiepool.create_cookie()
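The removed `LimitTimesUserPool.get_user` loop above is essentially round-robin account selection with skip conditions (no cookies, overworked, outside work hours, used too recently). A minimal sketch of that idea — demo classes only, not the project's code; `DemoUser` and `pick_user` are illustrative names:

```python
import time


class DemoUser:
    """Stand-in for a limit-times account (illustrative, not the removed class)."""

    def __init__(self, username, min_interval=5):
        self.username = username
        self.min_interval = min_interval      # seconds required between two uses
        self.last_used = 0.0
        self.cookies = {"session": "demo"}    # placeholder cookies

    def is_time_to_search(self):
        return time.time() - self.last_used >= self.min_interval


def pick_user(users, wait_when_null=False):
    """Walk the list, skip unusable accounts, return the first usable one."""
    index = 0
    while True:
        if not users:
            if wait_when_null:
                time.sleep(1)
                continue
            return None
        user = users[index % len(users)]
        index += 1
        if not user.cookies:                  # drop accounts without cookies
            users.remove(user)
            continue
        if user.is_time_to_search():          # frequency limit satisfied
            user.last_used = time.time()
            return user
        time.sleep(0.1)                       # used too recently, try the next one


if __name__ == "__main__":
    pool = [DemoUser("user_a"), DemoUser("user_b")]
    print(pick_user(pool).username)
```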

+ 0 - 33
FworkSpider/untils/create_menus.py

@@ -1,33 +0,0 @@
-from feapder.db.mongodb import MongoDB
-
-
-class Details:
-    _to_db = None
-    _to_db_xs = None
-    db_name = 'mgp_list'
-    # 定义mongo链接
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-        return self._to_db
-
-    @property
-    def to_db_xs(self):
-        if not self._to_db_xs:
-            self._to_db_xs = MongoDB(port=27001,db='editor')
-        return self._to_db_xs
-    def main(self,page):
-        menus_list = []
-        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
-        # print(data)
-        for item in data:
-            # print(item)
-            channls = item.get("param_common")[2]
-            code = item.get("code")
-            href = item.get("param_common")[11]
-            print("Menu"+"(",f"'{channls}',",f"'{code}',\n",f"'{href}',",page,"),")
-        #     menus_list.append(f'''Menu({channls},{code},{href},{page})''')
-        # print(menus_list)
-
-Details().main(2)
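For reference, the deleted `create_menus.py` only queried the `luaconfig` collection and printed ready-to-paste `Menu(...)` lines. A cleaned-up sketch of the same idea; the connection parameters and field positions are taken from the removed script and are assumptions about the target environment:

```python
from feapder.db.mongodb import MongoDB


def print_menus(site_keyword, page=1):
    """Print Menu(...) entries for every luaconfig record matching site_keyword."""
    db = MongoDB(port=27001, db="editor")     # editor library used by the old script
    query = {"param_common": {"$elemMatch": {"$regex": site_keyword}}}
    for item in db.find("luaconfig", query):
        channel = item["param_common"][2]     # column name
        code = item["code"]                   # spider code
        href = item["param_common"][11]       # start url
        print(f"Menu('{channel}', '{code}',\n     '{href}', {page}),")


if __name__ == "__main__":
    print_menus("广东省政府采购网", page=2)
```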

+ 11 - 15
FworkSpider/untils/execptions.py

@@ -1,19 +1,15 @@
+class PySpiderError(Exception):
 
 
-class CustomCheckError(Exception):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查失败'):
-        self.code = code
-        self.reason = reason
-
-
-class AttachmentNullError(Exception):
-
-    def __init__(self, code: int = 10004, reason: str = '附件下载失败'):
-        self.code = code
-        self.reason = reason
+    def __init__(self, *args, **kwargs):
+        if 'code' not in kwargs and 'reason' not in kwargs:
+            kwargs['code'] = 10000
+            kwargs['reason'] = '未知爬虫错误,请手动处理'
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(PySpiderError, self).__init__(*args, kwargs)
 
 
 
 
-class CustomAccountPrivilegeError(Exception):
+class AttachmentNullError(PySpiderError):
 
 
-    def __init__(self, *args, **kwargs):
-        pass
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常'):
+        super(AttachmentNullError, self).__init__(code=code, reason=reason)
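Usage sketch for the reworked exceptions (import path mirrors the project's other `untils` imports): `AttachmentNullError` goes through `PySpiderError.__init__`, which sets every keyword argument as an instance attribute, so `code` and `reason` are available on the caught instance.

```python
from untils.execptions import AttachmentNullError, PySpiderError

try:
    raise AttachmentNullError()    # defaults: code=10004, reason='附件下载异常'
except PySpiderError as e:         # base class catches all project errors
    print(e.code, e.reason)        # -> 10004 附件下载异常
```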

+ 129 - 12
FworkSpider/untils/get_imgcode.py

@@ -1,21 +1,138 @@
 import requests
-from typing import Mapping
 
 
+__all__ = [
+    "swordfish_platform",
+    "chaojiying_platform",
+    "chaojiying_report",
+    "get_code",
+    "get_code_det",
+    "arithmetic_captcha",
+]
 
 
-def get_code(file_path: str) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify"
-    with open(file_path, 'rb') as f:
+headers = {"accept": "application/json"}
+
+
+def _pack_file(file):
+    """包装验证码格式"""
+    if isinstance(file, str) and file.startswith("data:image"):
+        img_file = {"file": file}
+    elif isinstance(file, bytes):
+        img_file = {"file": file}
+    else:
+        with open(file, "rb") as f:
+            img_bytes = f.read()
+        img_file = {"file": img_bytes}
+    return img_file
+
+
+def _simple_captcha(file):
+    """
+    普通验证码
+
+    @param file: 验证码 - 可以是图片或者图片base64编码
+    @return:
+    """
+    url = "http://123.57.163.80:2119/v1/images/verify"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    rp_json = r.json()
+    if "msg" in rp_json and "success" == rp_json["msg"]:
+        return str(rp_json["r"]["code"]).upper()
+    return None
+
+
+def _arithmetic_captcha(file):
+    """算术验证码"""
+    url = "http://123.57.163.80:2119/v1/images/arithmetic"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def _get_click_verify_captcha(file):
+    """点触式验证码"""
+    url = "http://123.57.163.80:2119/v1/images/verify_det"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    return r.json()
+
+
+def swordfish_platform(file, mode="simple"):
+    """剑鱼验证码识别平台"""
+    if mode.lower() == "arithmetic":
+        return _arithmetic_captcha(file)
+    elif mode.lower() == "det":
+        return _get_click_verify_captcha(file)
+    else:
+        return _simple_captcha(file)
+
+
+def chaojiying_platform(file, pic_type: int):
+    """
+    超级鹰识别平台
+
+    pic_type,详情查询地址: https://www.chaojiying.com/price.html
+    """
+    with open(file, 'rb') as f:
         image_bytes = f.read()
-    content = {'file': image_bytes}
-    # json_resp = get_verify_code(upload_address, content)
+    files = {'file': image_bytes}
+
+    url = f"http://123.57.163.80:2119/v1/images/discern?pic_type={pic_type}"
     headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=10)
+    json_resp = response.json()
+    # print(json_resp)
+    '''code 返回0时,打码平台正常返回数据'''
+    pic_str = json_resp["r"]["pic_str"]
+    pic_id = json_resp["r"]["pic_id"]
+    print("pic_id >>", pic_id)
+    if 0 == json_resp["code"]:
+        return pic_str
+
+
+def chaojiying_report(pic_id: str):
+    """超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id"""
+    url = f"http://123.57.163.80:2119/v1/images/report_err?pic_id={pic_id}"
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    '''
+    回调成功:{'msg': 'OK', 'code': 0}  
+    此接口不能随便调用!程序逻辑里要这样判断: 如果 识别结果是错的 再调用 报错返分 接口。 如果没有这个判断或是无法判断,就不要调用!
+    '''
+    # print(response.json())
     return response.json()
 
 
+
+def get_code(file_path: str) -> dict:
+    return swordfish_platform(file_path) or {}
+
+
 def get_code_det(image_bytes) -> dict:
-   upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
-   content = {'image_content': image_bytes}
-   headers = {'accept': 'application/json'}
-   response = requests.post(upload_address, headers=headers, files=content, stream=True)
-   return response.json()
+    return swordfish_platform(image_bytes, mode="det")
+
 
 
+# 算术
+def arithmetic_captcha(image_stream):
+    return swordfish_platform(image_stream, mode="arithmetic")
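A short, hedged example of calling the new captcha helpers; `"captcha.jpg"` is a placeholder path and the recognition service at `123.57.163.80` must be reachable for the calls to succeed.

```python
from untils.get_imgcode import swordfish_platform, chaojiying_platform, chaojiying_report

print(swordfish_platform("captcha.jpg"))                     # plain image captcha -> text
print(swordfish_platform("captcha.jpg", mode="arithmetic"))  # arithmetic captcha -> result
print(swordfish_platform("captcha.jpg", mode="det"))         # click captcha -> raw JSON dict

# Chaojiying fallback: pic_type codes are listed at https://www.chaojiying.com/price.html;
# call chaojiying_report(pic_id) only after confirming the returned answer was wrong.
# answer = chaojiying_platform("captcha.jpg", pic_type=1902)
```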

+ 2 - 762
FworkSpider/untils/proxy_pool.py

@@ -1,763 +1,3 @@
-# coding:utf8
-"""
-代理池
-"""
-import datetime
-import json
-import os
-import random
-import socket
-import time
-from urllib import parse
+from feapder.network.proxy_pool import ProxyPool
 
 
-import redis
-import requests
-
-from feapder import setting
-from feapder.utils import tools
-from feapder.utils.log import log
-
-
-def decrypt(input_str: str) -> str:
-    """
-    改写:新增
-    定义base64解密函数
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # 对前面不是“=”的字节取索引,然后转换为2进制
-    # 补齐“=”的个数
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # 转换成2进制字符串
-        temp_str = ''.join(temp_list)
-        # 对没有8位2进制的字符串补够8位2进制
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # 4个6字节的二进制  转换  为三个8字节的二进制
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # 二进制转为10进制
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # 连接成字符串
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-
-# 建立本地缓存代理文件夹
-proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
-if not os.path.exists(proxy_path):
-    os.mkdir(proxy_path)
-
-
-# def get_proxies_by_host(host, port):
-#     proxy_id = "{}:{}".format(host, port)
-#     return get_proxies_by_id(proxy_id)
-
-
-# def get_proxies_by_id(proxy_id):
-#     proxies = {
-#         "http": "http://{}".format(proxy_id),
-#         "https": "https://{}".format(proxy_id),
-#     }
-#     return proxies
-
-
-def get_proxy_from_url(**kwargs):
-    """
-    获取指定url的代理
-    :param kwargs:
-    :return:
-    """
-    proxy_source_url = kwargs.get("proxy_source_url", [])
-    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
-
-    if not isinstance(proxy_source_url, list):
-        proxy_source_url = [proxy_source_url]
-        proxy_source_url = [x for x in proxy_source_url if x]
-    if not proxy_source_url:
-        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
-    kwargs = kwargs.copy()
-    kwargs.pop("proxy_source_url")
-    proxies_list = []
-    for url in proxy_source_url:
-        if url.startswith("http"):
-            proxies_list.extend(get_proxy_from_http(url, **kwargs))
-        elif url.startswith("redis"):
-            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
-
-    if proxies_list:
-        # 顺序打乱
-        random.shuffle(proxies_list)
-    return proxies_list
-
-
-def get_proxy_from_http(proxy_source_url, **kwargs):
-    """
-    从指定 http 地址获取代理
-    :param proxy_source_url:
-    :param kwargs:
-    :return:
-    """
-    filename = tools.get_md5(proxy_source_url) + ".txt"
-    abs_filename = os.path.join(proxy_path, filename)
-    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
-    update_flag = 0
-    if not update_interval:
-        # 强制更新
-        update_flag = 1
-    elif not os.path.exists(abs_filename):
-        # 文件不存在则更新
-        update_flag = 1
-    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
-        # 超过更新间隔
-        update_flag = 1
-    if update_flag:
-        pool = []
-        response = requests.get(proxy_source_url, timeout=20)
-        # 改写:获取scocks代理的response处理
-        for proxy in response.json():
-            host = decrypt(proxy['host'])
-            port = proxy['port']
-            endTime = proxy['EndTime']
-            pool.append(f"{host}:{port}&&{endTime}")
-
-        with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write('\n'.join(pool))
-    return get_proxy_from_file(filename)
-
-
-def get_proxy_from_file(filename, **kwargs):
-    """
-    从指定本地文件获取代理
-        文件格式
-        ip:port:https
-        ip:port:http
-        ip:port
-    :param filename:
-    :param kwargs:
-    :return:
-    """
-    proxies_list = []
-    with open(os.path.join(proxy_path, filename), "r") as f:
-        lines = f.readlines()
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # 解析
-        auth = ""
-        if "@" in line:
-            auth, line = line.split("@")
-        # 改写,解析代理有效期结束时间
-        line, end = line.split("&&")
-
-        items = line.split(":")
-        if len(items) < 2:
-            continue
-
-        ip, port, *protocol = items
-        if not all([port, ip]):
-            continue
-        if auth:
-            ip = "{}@{}".format(auth, ip)
-        if not protocol:
-            # 改写:判断代理是否在有效期内,并将代理格式重http格式改成socks格式
-            if time.time() < int(end):
-                proxies = {
-                    "https": "socks5://%s:%s" % (ip, port),
-                    "http": "socks5://%s:%s" % (ip, port),
-                    # "end":end
-                }
-            else:
-                continue
-        else:
-            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
-        proxies_list.append(proxies)
-
-    return proxies_list
-
-
-def get_proxy_from_redis(proxy_source_url, **kwargs):
-    """
-    从指定 redis 地址获取代理
-    @param proxy_source_url: redis://:passwd@host:ip/db
-        redis 存储结构 zset
-        ip:port ts
-    @param kwargs:
-        {"redis_proxies_key": "xxx"}
-    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
-    """
-
-    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
-    key = kwargs.get("redis_proxies_key")
-    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
-    proxies = redis_conn.zrange(key, 0, -1)
-    proxies_list = []
-    for proxy in proxies:
-        proxy = proxy.decode()
-        proxies_list.append(
-            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
-        )
-    return proxies_list
-
-
-def check_proxy(
-        ip="",
-        port="",
-        proxies=None,
-        type=0,
-        timeout=5,
-        logger=None,
-        show_error_log=True,
-        **kwargs,
-):
-    """
-    代理有效性检查
-    :param ip:
-    :param port:
-    :param type: 0:socket  1:requests
-    :param timeout:
-    :param logger:
-    :return:
-    """
-    if not logger:
-        logger = log
-    ok = 0
-    if type == 0 and ip and port:
-        # socket检测成功 不代表代理一定可用 Connection closed by foreign host. 这种情况就不行
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(timeout)
-            try:
-                # 必须检测 否则代理永远不刷新
-                sk.connect((ip, int(port)))
-                ok = 1
-            except Exception as e:
-                if show_error_log:
-                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
-            sk.close()
-    else:
-        if not proxies:
-            proxies = {
-                "http": "socks5://{}:{}".format(ip, port),
-                "https": "socks5//{}:{}".format(ip, port),
-            }
-        try:
-            # 改写:代理检测的url
-            r = requests.get(
-                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
-            )
-            ok = 1
-            r.close()
-        except Exception as e:
-            if show_error_log:
-                logger.debug(
-                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
-                )
-    return ok
-
-
-class ProxyItem(object):
-    """单个代理对象"""
-
-    # 代理标记
-    proxy_tag_list = (-1, 0, 1)
-
-    def __init__(
-            self,
-            proxies=None,
-            valid_timeout=20,
-            check_interval=180,
-            max_proxy_use_num=10000,
-            delay=30,
-            use_interval=None,
-            **kwargs,
-    ):
-        """
-        :param proxies:
-        :param valid_timeout:  代理检测超时时间 默认-1    20181008  默认不再监测有效性
-        :param check_interval:
-        :param max_proxy_use_num:
-        :param delay:
-        :param use_interval: 使用间隔 单位秒 默认不限制
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs:
-        """
-        # {"http": ..., "https": ...}
-        self.proxies = proxies
-        # 检测超时时间 秒
-        self.valid_timeout = valid_timeout
-        # 检测间隔 秒
-        self.check_interval = check_interval
-
-        # 标记  0:正常 -1:丢弃  1: 待会再用 ...
-        self.flag = 0
-        # 上次状态变化时间
-        self.flag_ts = 0
-        # 上次更新时间 有效时间
-        self.update_ts = 0
-        # 最大被使用次数
-        self.max_proxy_use_num = max_proxy_use_num
-        # 被使用次数记录
-        self.use_num = 0
-        # 延迟使用时间
-        self.delay = delay
-        # 使用间隔 单位秒
-        self.use_interval = use_interval
-        # 使用时间
-        self.use_ts = 0
-
-        self.proxy_args = self.parse_proxies(self.proxies)
-        self.proxy_ip = self.proxy_args["ip"]
-        self.proxy_port = self.proxy_args["port"]
-        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
-        if self.proxy_args["user"]:
-            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
-        else:
-            self.proxy_id = self.proxy_ip_port
-
-        # 日志处理器
-        self.logger = log
-
-    def get_proxies(self):
-        self.use_num += 1
-        return self.proxies
-
-    def is_delay(self):
-        return self.flag == 1
-
-    def is_valid(self, force=0, type=0):
-        """
-        检测代理是否有效
-            1 有效
-            2 延时使用
-            0 无效 直接在代理池删除
-        :param force:
-        :param type:
-        :return:
-        """
-        if self.use_num > self.max_proxy_use_num > 0:
-            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
-            return 0
-        if self.flag == -1:
-            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
-            return 0
-        if self.delay > 0 and self.flag == 1:
-            if time.time() - self.flag_ts < self.delay:
-                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
-                return 2
-            else:
-                self.flag = 0
-                self.logger.debug("延迟代理释放: {}".format(self.proxies))
-        if self.use_interval:
-            if time.time() - self.use_ts < self.use_interval:
-                return 2
-        if not force:
-            if time.time() - self.update_ts < self.check_interval:
-                return 1
-        if self.valid_timeout > 0:
-            ok = check_proxy(
-                proxies=self.proxies,
-                type=type,
-                timeout=self.valid_timeout,
-                logger=self.logger,
-            )
-        else:
-            ok = 1
-        self.update_ts = time.time()
-        return ok
-
-    @classmethod
-    def parse_proxies(self, proxies):
-        """
-        分解代理组成部分
-        :param proxies:
-        :return:
-        """
-        if not proxies:
-            return {}
-        if isinstance(proxies, (str, bytes)):
-            proxies = json.loads(proxies)
-        protocol = list(proxies.keys())
-        if not protocol:
-            return {}
-        _url = proxies.get(protocol[0])
-        # 改写:注释http代理url的拼接,以正常生成代理池
-        # if not _url.startswith("http"):
-        #     _url = "http://" + _url
-        _url_parse = parse.urlparse(_url)
-        netloc = _url_parse.netloc
-        if "@" in netloc:
-            netloc_auth, netloc_host = netloc.split("@")
-        else:
-            netloc_auth, netloc_host = "", netloc
-        ip, *port = netloc_host.split(":")
-        port = port[0] if port else "80"
-        user, *password = netloc_auth.split(":")
-        password = password[0] if password else ""
-        return {
-            "protocol": protocol,
-            "ip": ip,
-            "port": port,
-            "user": user,
-            "password": password,
-            "ip_port": "{}:{}".format(ip, port),
-        }
-
-
-class ProxyPoolBase(object):
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def get(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class ProxyPool(ProxyPoolBase):
-    """代理池"""
-
-    def __init__(self, **kwargs):
-        """
-        :param size: 代理池大小  -1 为不限制
-        :param proxy_source_url: 代理文件地址 支持列表
-        :param proxy_instance:  提供代理的实例
-        :param reset_interval:  代理池重置间隔 最小间隔
-        :param reset_interval_max:  代理池重置间隔 最大间隔 默认2分钟
-        :param check_valid: 是否在获取代理时进行检测有效性
-        :param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs: 其他的参数
-        """
-        kwargs.setdefault("size", -1)
-        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
-
-        super(ProxyPool, self).__init__(**kwargs)
-        # 队列最大长度
-        self.max_queue_size = kwargs.get("size", -1)
-        # 实际代理数量
-        self.real_max_proxy_count = 1000
-        # 代理可用最大次数
-        # 代理获取地址 http://localhost/proxy.txt
-        self.proxy_source_url = kwargs.get("proxy_source_url", [])
-        if not isinstance(self.proxy_source_url, list):
-            self.proxy_source_url = [self.proxy_source_url]
-            self.proxy_source_url = [x for x in self.proxy_source_url if x]
-            self.proxy_source_url = list(set(self.proxy_source_url))
-            kwargs.update({"proxy_source_url": self.proxy_source_url})
-        # 处理日志
-        self.logger = kwargs.get("logger") or log
-        kwargs["logger"] = self.logger
-        if not self.proxy_source_url:
-            self.logger.warn("need set proxy_source_url or proxy_instance")
-
-        # 代理池重置间隔
-        self.reset_interval = kwargs.get("reset_interval", 5)
-        # 强制重置一下代理 添加新的代理进来 防止一直使用旧的被封的代理
-        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
-        # 是否监测代理有效性
-        self.check_valid = kwargs.get("check_valid", True)
-
-        # 代理队列
-        self.proxy_queue = None
-        # {代理id: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 失效代理队列
-        self.invalid_proxy_dict = {}
-
-        self.kwargs = kwargs
-
-        # 重置代理池锁
-        self.reset_lock = None
-        # 重置时间
-        self.last_reset_time = 0
-        # 重置的太快了  计数
-        self.reset_fast_count = 0
-        # 计数 获取代理重试3次仍然失败 次数
-        self.no_valid_proxy_times = 0
-
-        # 上次获取代理时间
-        self.last_get_ts = time.time()
-
-        # 记录ProxyItem的update_ts 防止由于重置太快导致重复检测有效性
-        self.proxy_item_update_ts_dict = {}
-
-        # 警告
-        self.warn_flag = False
-
-    def warn(self):
-        if not self.warn_flag:
-            for url in self.proxy_source_url:
-                if "zhima" in url:
-                    continue
-            self.warn_flag = True
-        return
-
-    @property
-    def queue_size(self):
-        """
-        当前代理池中代理数量
-        :return:
-        """
-        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
-
-    def clear(self):
-        """
-        清空自己
-        :return:
-        """
-        self.proxy_queue = None
-        # {代理ip: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 清理失效代理集合
-        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
-        self.invalid_proxy_dict = {
-            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
-        }
-        # 清理超时的update_ts记录
-        _limit = time.time() - 600
-        self.proxy_item_update_ts_dict = {
-            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
-        }
-        return
-
-    def get(self, retry: int = 0) -> dict:
-        """
-        从代理池中获取代理
-        :param retry:
-        :return:
-        """
-        retry += 1
-        if retry > 3:
-            self.no_valid_proxy_times += 1
-            return None
-        # if time.time() - self.last_get_ts > 3 * 60:
-        #     # 3分钟没有获取过 重置一下
-        #     try:
-        #         self.reset_proxy_pool()
-        #     except Exception as e:
-        #         self.logger.exception(e)
-        # 记录获取时间
-        self.last_get_ts = time.time()
-        #
-        self.warn()
-        proxy_item = self.get_random_proxy()
-        if proxy_item:
-            # 不检测
-            if not self.check_valid:  #
-                # 塞回去
-                proxies = proxy_item.get_proxies()
-                self.put_proxy_item(proxy_item)
-                return proxies
-            else:
-                is_valid = proxy_item.is_valid()
-                if is_valid:
-                    # 记录update_ts
-                    self.proxy_item_update_ts_dict[
-                        proxy_item.proxy_id
-                    ] = proxy_item.update_ts
-                    # 塞回去
-                    proxies = proxy_item.get_proxies()
-                    self.put_proxy_item(proxy_item)
-                    if is_valid == 1:
-                        if proxy_item.use_interval:
-                            proxy_item.use_ts = time.time()
-                        return proxies
-                else:
-                    # 处理失效代理
-                    self.proxy_dict.pop(proxy_item.proxy_id, "")
-                    self.invalid_proxy_dict[
-                        proxy_item.proxy_id
-                    ] = datetime.datetime.now()
-        else:
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        if self.no_valid_proxy_times >= 5:
-            # 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多(时间越长越多) 可能出现一直获取不到代理的情况
-            # 导致爬虫烂尾
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        return self.get(retry)
-
-    get_proxy = get
-
-    def get_random_proxy(self) -> ProxyItem:
-        """
-        随机获取代理
-        :return:
-        """
-        if self.proxy_queue is not None:
-            if random.random() < 0.5:
-                # 一半概率检查 这是个高频操作 优化一下
-                if time.time() - self.last_reset_time > self.reset_interval_max:
-                    time.sleep(3)
-                    self.reset_proxy_pool(force=True)
-                else:
-                    min_q_size = (
-                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
-                        if self.max_queue_size > 0
-                        else self.real_max_proxy_count / 2
-                    )
-                    if self.proxy_queue.qsize() < min_q_size:
-                        time.sleep(3)
-                        self.reset_proxy_pool()
-            try:
-                return self.proxy_queue.get_nowait()
-            except Exception:
-                pass
-        return None
-
-    def append_proxies(self, proxies_list: list) -> int:
-        """
-        添加代理到代理池
-        :param proxies_list:
-        :return:
-        """
-        count = 0
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if proxies:
-                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
-                # 增加失效判断 2018/12/18
-                if proxy_item.proxy_id in self.invalid_proxy_dict:
-                    continue
-                if proxy_item.proxy_id not in self.proxy_dict:
-                    # 补充update_ts
-                    if not proxy_item.update_ts:
-                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
-                            proxy_item.proxy_id, 0
-                        )
-                    self.put_proxy_item(proxy_item)
-                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
-                    count += 1
-        return count
-
-    def put_proxy_item(self, proxy_item: ProxyItem):
-        """
-        添加 ProxyItem 到代理池
-        :param proxy_item:
-        :return:
-        """
-        return self.proxy_queue.put_nowait(proxy_item)
-
-    def reset_proxy_pool(self, force: bool = False):
-        """
-        重置代理池
-        :param force: 是否强制重置代理池
-        :return:
-        """
-        if not self.reset_lock:
-            # 必须用时调用 否则 可能存在 gevent patch前 threading就已经被导入 导致的Rlock patch失效
-            import threading
-
-            self.reset_lock = threading.RLock()
-        with self.reset_lock:
-            if (
-                    force
-                    or self.proxy_queue is None
-                    or (
-                    self.max_queue_size > 0
-                    and self.proxy_queue.qsize() < self.max_queue_size / 2
-            )
-                    or (
-                    self.max_queue_size < 0
-                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
-            )
-                    or self.no_valid_proxy_times >= 5
-            ):
-                if time.time() - self.last_reset_time < self.reset_interval:
-                    self.reset_fast_count += 1
-                    if self.reset_fast_count % 10 == 0:
-                        self.logger.debug(
-                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
-                        )
-                        time.sleep(1)
-                else:
-                    self.clear()
-                    if self.proxy_queue is None:
-                        import queue
-
-                        self.proxy_queue = queue.Queue()
-                    # TODO 这里获取到的可能重复
-                    proxies_list = get_proxy_from_url(**self.kwargs)
-                    self.real_max_proxy_count = len(proxies_list)
-                    if 0 < self.max_queue_size < self.real_max_proxy_count:
-                        proxies_list = random.sample(proxies_list, self.max_queue_size)
-                    _valid_count = self.append_proxies(proxies_list)
-                    self.last_reset_time = time.time()
-                    self.no_valid_proxy_times = 0
-                    self.logger.debug(
-                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
-                            len(proxies_list),
-                            _valid_count,
-                            len(self.invalid_proxy_dict),
-                            len(self.proxy_dict),
-                        )
-                    )
-        return
-
-    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
-        """
-        对代理进行标记
-        :param proxies_list:
-        :param flag:
-                    -1  废弃
-                    1 延迟使用
-        :param delay: 延迟时间
-        :return:
-        """
-        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
-            return False
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if not proxies:
-                continue
-            proxy_id = ProxyItem(proxies).proxy_id
-            if proxy_id not in self.proxy_dict:
-                continue
-            self.proxy_dict[proxy_id].flag = flag
-            self.proxy_dict[proxy_id].flag_ts = time.time()
-            self.proxy_dict[proxy_id].delay = delay
-
-        return True
-
-    def get_proxy_item(self, proxy_id="", proxies=None):
-        """
-        获取代理对象
-        :param proxy_id:
-        :param proxies:
-        :return:
-        """
-        if proxy_id:
-            return self.proxy_dict.get(proxy_id)
-        if proxies:
-            proxy_id = ProxyItem(proxies).proxy_id
-            return self.proxy_dict.get(proxy_id)
-        return
-
-    def copy(self):
-        return ProxyPool(**self.kwargs)
-
-    def all(self) -> list:
-        """
-        获取当前代理池中的全部代理
-        :return:
-        """
-        return get_proxy_from_url(**self.kwargs)
-
-
-if __name__ == '__main__':
-    ProxyPool().get()
+__all__ = ["ProxyPool"]
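`untils.proxy_pool` now simply re-exports feapder's `ProxyPool`. A minimal usage sketch, assuming the feapder pool keeps the `get()` interface that the removed local copy exposed (it aliased `get_proxy = get` and returned a requests-style proxies dict or `None`):

```python
from untils.proxy_pool import ProxyPool

pool = ProxyPool()
proxies = pool.get()   # method name assumed from the removed local implementation
if proxies:
    # e.g. {"http": "socks5://ip:port", "https": "socks5://ip:port"}
    print(proxies)
```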

+ 186 - 123
FworkSpider/untils/tools.py

@@ -1,24 +1,93 @@
+import copy
+import functools
 import hashlib
-import json
 import re
 from collections import namedtuple
-import requests
-from setting import WECHAT_WARNING_URL,WECHAT_WARNING_PHONE,WARNING_INTERVAL,WECHAT_WARNING_ALL
+from string import whitespace
+
 import bson
-from feapder.utils.log import log
-from feapder.db.mongodb import MongoDB
-from .cleaner import cleaner
-import sys
+import requests
+
+from untils.clean_html import cleaner
 
 
 SearchText = namedtuple('SearchText', ['total'])
 
 
 
 
-def substitute(html_str,special=None, completely=False):
+def substitute(html_str, special=None, completely=False):
     """HTML 替换"""
     """HTML 替换"""
-    html_str = cleaner(html=html_str,special=None, completely=False)
+    html_str = cleaner(html=html_str, special=special, completely=completely)
     return html_str
 
 
 
 
+def merge_files(*files):
+    """合并文件"""
+    res = {}
+    for file_ in files:
+        if isinstance(file_, dict):
+            for _, attachment in file_.items():
+                res[str(len(res) + 1)] = attachment
+    return res
+
+
+def is_all_chinese(strs):
+    """检验是否全是中文字符"""
+    for _char in strs:
+        if not '\u4e00' <= _char <= '\u9fa5':
+            return False
+    return True
+
+
+def clean_document(*fields):
+    """
+    清洗mongo文档
+
+    :param fields: 清洗字段
+
+    # 用例:
+    # >>> clean_document('dzr')(lambda *args, **kw: None)(document)
+    """
+
+    def clean(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            defaults = {
+                "_id",
+                "parser_name", "parser_url", "request_params",
+                "failed", "error"
+            }
+            removes = defaults if not fields else {*defaults, *fields}
+            item = args[0] if not kwargs else kwargs
+            data_dict = item if isinstance(item, dict) else item.to_dict
+            copy_data_dict = copy.deepcopy(data_dict)
+            for k in copy_data_dict.keys():
+                if k in removes:
+                    del data_dict[k]
+                    try:
+                        delattr(item, k)  # 删除 Item 类实例属性
+                    except AttributeError:
+                        pass
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return clean
+
+
+def clean_chars(text, charsets=whitespace):
+    """
+    按照字符集,删除字符
+
+    :param str text: 文本
+    :param charsets: 字符集
+    :return: 干净的文本
+    """
+    if text is not None:
+        for char in charsets:
+            if char in text:
+                text = text.replace(char, '')
+    return text
+
+
 def get_signature(content: str) -> str:
     """
     十六进制数字字符串形式摘要值
@@ -31,6 +100,15 @@ def get_signature(content: str) -> str:
     return sha1.hexdigest()
 
 
 
 
+def get_md5(val):
+    md5 = hashlib.md5()
+    if isinstance(val, bytes):
+        md5.update(str(val).encode("utf-8"))
+    elif isinstance(val, str):
+        md5.update(val.encode("utf-8"))
+    return md5.hexdigest()
+
+
 def text_search(content: str) -> SearchText:
     """
     中文检索
@@ -50,115 +128,93 @@ def int2long(param: int):
     """int 转换成 long """
     """int 转换成 long """
     return bson.int64.Int64(param)
     return bson.int64.Int64(param)
 
 
-def get_spiders(menus):
-    db = MongoDB(db="editor")
-    for menu in menus:
-        spider_info = db.find('luaconfig',{"code":menu.code})
-        if len(spider_info) >0:
-            if spider_info[0].get("state") not in (11,):
-                menus.remove(menu)
-
-def wechat_warning(
-    message,
-    message_prefix=None,
-    rate_limit=None,
-    url=None,
-    user_phone=None,
-    all_users: bool = None,
-):
-    """企业微信报警"""
-
-    # 为了加载最新的配置
-    rate_limit = rate_limit if rate_limit is not None else WARNING_INTERVAL
-    url = url or WECHAT_WARNING_URL
-    user_phone = user_phone or WECHAT_WARNING_PHONE
-    all_users = all_users if all_users is not None else WECHAT_WARNING_ALL
-
-    if isinstance(user_phone, str):
-        user_phone = [user_phone] if user_phone else []
-
-    if all_users is True or not user_phone:
-        user_phone = ["@all"]
-
-    if not all([url, message]):
-        return
-
-    data = {
-        "msgtype": "text",
-        "text": {"content": message, "mentioned_mobile_list": user_phone},
-    }
 
 
-    headers = {"Content-Type": "application/json"}
+def njpc_hpsj_filt_keywords(text: str, special_kw=None):
+    if special_kw is None:
+        special_kw = {}
+
+    keywords = {'项目', '工程', '验收', '评价', *special_kw}
 
 
-    try:
-        response = requests.post(
-            url, headers=headers, data=json.dumps(data).encode("utf8")
-        )
-        result = response.json()
-        response.close()
-        if result.get("errcode") == 0:
-            return True
+    for keyword in keywords:
+        result = re.match(f'.*{keyword}', text, re.S)
+        if result is not None:
+            return True  # 需要采集
+    else:
+        return False     # 丢弃
+
+
+# 拟建爬虫字段正则抽取
+def njpc_fields_extract(html, data_item, is_clean=False):
+    """
+        拟建爬虫字段正则抽取
+    :param str html: 页面源码
+    :param Items data_item: 详情页item
+    :param bool is_clean: 是否对源码进行清洗
+    :return:
+    """
+    if is_clean:
+        html = substitute(html)
+
+    data_item.title = data_item.projectname
+    projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvecode = re.findall('项目代码(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    owner = re.findall('建设(?:单位|单位名称)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_phone = re.findall('联系(?:电话|方式)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvedept = re.findall('审批部门(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvenumber = re.findall('(?:审批|批准)文号(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvetime = re.findall('审批时间(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S))
+    project_completedate = re.findall('竣工日期(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+
+    if project_scale:
+        construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        if not construction_area:
+            construction_area = ""
         else:
-            raise Exception(result.get("errmsg"))
-    except Exception as e:
-        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
-        return False
-
-class JyBasicException(Exception):
-
-    def __init__(self, code: int, reason: str, **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-class CustomCheckError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-class HtmlEmptyError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '正文获取异常,正文为空', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-class CheckPrePareRequest:
-
-    def __init__(self):
-        self.crawl_keywords = {
-            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
-            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
-            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
-            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
-            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
-            '终止', '系统'
-        }
-
-    def check_crawl_title(self, title: str):
-        for keyword in self.crawl_keywords:
-            valid_keyword = re.search(keyword, title)
-            if valid_keyword is not None:
-                break
+            construction_area = re.sub(":|:", "", construction_area)
+
+        if not floor_area:
+            floor_area = ""
         else:
-            # raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
-            return 10106,'标题未检索到采集关键词'
-        return 200,'ok'
+            floor_area = re.sub(":|:", "", floor_area)
+
+        data_item.project_scale = project_scale
+        data_item.project_scale_info = {
+            "construction_area": construction_area,
+            "floor_area": floor_area,
+        }  # 建设规模及主要内容
+
+    fields_dict = {
+        "projectname": projectname,
+        "owner": owner,
+        "total_investment": total_investment,
+        "project_person": project_person,
+        "project_phone": project_phone,
+        "approvedept": approvedept,
+        "approvetime": approvetime,
+        "project_completedate": project_completedate,
+        "projectaddr": projectaddr,
+        "approvecode": approvecode,
+        "approvecontent": approvecontent,
+        "approvenumber": approvenumber
+    }
+    for fields_k, fields_v in fields_dict.items():
+        if fields_v:
+            fields_v[0] = clean_chars(fields_v[0])
+            if not fields_v[0]:
+                continue
 
 
+            data_item[fields_k] = re.sub(
+                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
+                "", fields_v[0])
 
 
-    def __check(self, rows: dict):
-        title, publish_time = rows['title'], rows['l_np_publishtime']
-        self.check_crawl_title(title)
+    return data_item
 
 
-    def __call__(self, rows: dict, *args, **kwargs):
-        self.__check(rows)
 
 
 def get_proxy():
     headers = {
@@ -167,32 +223,39 @@ def get_proxy():
     proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
     print(f"切换代理:{proxy.get('data')}")
     return proxy.get("data").get("http")
-import json
 
 
-class Obj(object):
-    def __init__(self, dict_):
-        self.__dict__.update(dict_)
-
-def get_argvs():
-    argvs = {"next_page":False,"max_page":10}
-    for item in sys.argv[1:]:
-        print(item)
-        if item.startswith("--"):
-            argvs[item.replace("--", "").split('=')[0]] = int(item.split('=')[-1])
-    return json.loads(json.dumps(argvs), object_hook=Obj)
 
 
 def search(pattern, string):
     result = re.search(pattern, string)
     if result:
         return result.groups()[0]
 
 
+
 def search_construction(string):
 def search_construction(string):
     result = re.search('pattern', string)
     if result:
         return result.groups()[0]
 
+
 def search_floor(string):
 def search_floor(string):
     result = re.search('pattern', string)
     if result:
         return result.groups()[0]
 
+
+def get_floor_area(project_scale):
+    floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+    if not floor_area:
+        floor_area = ""
+    else:
+        floor_area = floor_area.replace(':', '').replace(':', '')
+    return floor_area
+
+
+def get_construction_area(project_scale):
+    construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+    if not construction_area:
+        construction_area = ""
+    else:
+        construction_area = construction_area.replace(':', '').replace(':', '')
+    return construction_area
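Small usage sketch for the new helpers added to `untils/tools.py`; the import path assumes `FworkSpider` is on `sys.path`, consistent with `from untils.clean_html import cleaner` above.

```python
from untils.tools import (
    clean_chars, clean_document, get_md5, is_all_chinese, njpc_hpsj_filt_keywords
)

print(clean_chars("招标 公告\t2023"))                 # "招标公告2023" – whitespace removed
print(get_md5("招标公告"))                            # 32-char hex digest
print(is_all_chinese("招标公告"))                     # True
print(njpc_hpsj_filt_keywords("某某学校改扩建工程"))   # True – title hits the "工程" keyword

# clean_document strips bookkeeping fields (_id, parser_name, failed, error, ...)
# from the first dict argument before the wrapped function runs:
doc = {"_id": 1, "title": "测试", "failed": 0, "error": None}
clean_document()(lambda item: print(item))(doc)       # {'title': '测试'}
```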

+ 0 - 0
NoteWork/python乱码识别/__init__.py


+ 0 - 0
NoteWork/文档/img.png


+ 0 - 0
NoteWork/文档/img_1.png


+ 0 - 0
NoteWork/文档/img_10.png


+ 0 - 0
NoteWork/文档/img_11.png


+ 0 - 0
NoteWork/文档/img_12.png


+ 0 - 0
NoteWork/文档/img_13.png


+ 0 - 0
NoteWork/文档/img_2.png


+ 0 - 0
NoteWork/文档/img_3.png


+ 0 - 0
NoteWork/文档/img_4.png


+ 0 - 0
NoteWork/文档/img_5.png


+ 0 - 0
NoteWork/文档/img_6.png


+ 0 - 0
NoteWork/文档/img_7.png


+ 0 - 0
NoteWork/文档/img_8.png


+ 0 - 0
NoteWork/文档/img_9.png


+ 0 - 29
NoteWork/文档/update.md

@@ -1,29 +0,0 @@
-### 1、快照页:已完成
-    附件采集方法        
-### 2、关联lua爬虫接口:已完成
-    待开发爬虫任务管理  #
-### 3、报警修改   爬虫报警规则:已完成
-	1、失败一定次数  
-	2、爬虫当前任务成功率过低
-	3、爬虫导出数据失败一定次数
-	4、爬虫任务停滞
-	5、爬虫异常停止
-```python
-
-
-```
-
-### 4、爬虫校验,同时只运行一个  *无需修改 
-    爬虫为分布式爬虫,后续新建任务不会重新运行,
-	会读取当前爬虫中未完成的任务,协同执行  
-    /// 若两个爬虫同一时间开始执行,这时无法处理 
-
-
-### 5、重新采集的一个字段  :已完成
-    关于正文/其他数据采集为空,这里进行了处理,停止当前管道线程,把其当做错误请求处理,
-	五次容错机会,五次均失败后丢弃当前连接,等待下一轮爬虫执行时重试
-
-### 6、快速定位,项目爬虫代码、指定人员  :可指定人员、获取爬虫名称,但无法直接跳转到爬虫文件 
-        可分层级,每个角色-单独一个爬虫,按地区分文件 以便快速查找爬虫文件
-### 7、管理平台消息自定义  无需改动
-    # 两个消息发送方式:爬虫结束时发送,爬虫异常结束时报错

+ 0 - 108
NoteWork/文档/开发文档.md

@@ -1,108 +0,0 @@
-
-## feapder爬虫开发文档
-#### 本地调试环境安装
-    python环境,python安装
-    redis + mongo
-#### 创建爬虫
-    命令创建  create -s ***** 4
-
-### 编辑爬虫
-1、编辑站点信息、栏目信息等基础等
-```python
-	def start_callback(self):
-		Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-	    self.site= "测试地址平台采集"
-        self.menus = [
-             # Menu('Ceshidizhi抓取栏目', 'Ceshidizhi爬虫code', "自定义参数", 1),
-             # Menu('Ceshidizhi抓取栏目', 'Ceshidizhi爬虫code', "Notice", 1),
-             Menu('政府采购-采购公告', 'hn_ceshidizhi_zfcg_cggg', "zfcg/cggg", 1),
-             Menu('综合其他-中标前公示', 'hn_ceshidizhi_zhqt_zbqgs', "zhqt/zbqgs", 1),
-         ]
-         
-	def start_requests(self):
-    	for menu in self.menus:
-        	for page in range(1,menu.crawl_page+1):
-            	start_url = f'http://www.ceshi.com/{menu.types}'
-	            yield feapder.Request(url=start_url, item=menu._asdict(), proxies=False)
-```
-
-2、根据栏目信息,配置相对应的起始连接,代理ip默认为未启用,如需启用代理,将Proxies修改为True
-
-3、编辑列表页解析的xpath规则/json解析字段
-```python
-    def parse(self, request, response): #xpath方式:
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []  # 不用修改
-        info_list = response.xpath("//table[@class='p1']/tr[position()>1]")
-        for info in info_list:
-            href = info.xpath('./td[1]/a/@href').extract_first().strip()
-            title = info.xpath('./td[1]/a/text()').extract_first().strip()
-            
-            '''保证时间格式为 0000-00-00 00:00:00 或 0000-00-00格式'''
-            create_time = info.xpath('./td[5]/text()').extract_first().strip()
-            
-            '''如果有省市信息,一定要按具体规则解析或切割省市信息'''
-            area = info.xpath('./td[4]/text()').extract_first()
-            city = info.xpath('./td[4]/text()').extract_first()   #城市
-            area = area if area else "全国"   #省份
-            city = city if city else ""   #省份
-            
-    def parse(self, request, response): #json方式:
-        menu = request.item
-        dedup = Dedup(Dedup.BloomFilter)
-        href_list = []
-        info_list = response.json.get("data").get("list")
-        for info in info_list:
-            href = info.get("href")
-            title = info.get("title")
-            crate_time = info.get("create_time")
-            area = info.get("area")
-            city = info.get("city")
-            area = area if area else "全国"  # 省份
-            city = city if city else ""  # 城市
-```
-
-4、编辑详情页解析的xpath规则/部分代码编写
-```python
-    list_item =  MgpListItem()
-    list_item.parse = "self.detail_get"
-    list_item.parser_name = "details"
-    list_item.item = data_item.to_dict
-    list_item.deal_detail ['//div[@class="content"]']
-    list_item.proxies = False
-    list_item.parse_url = href
-    list_item.author = 'mgp' # 自定author,如无author,则根据文件夹的名称自成
-    list_item.pri = 1
-    list.files={
-        "list_xpath":'//div[@class="notice-foot"]/a',
-        "url_xpath":'./@href',
-        "name_xpath":'./text()',
-        "files_type":('zip','doxc','ftp'), # 需要下载的附件类型
-        "file_type":'docx',   # 默认的附件类型,用于url中未带附件类型的
-        "url_key":'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-        "host":'http://www.ceshi.com',  # 需要拼接url的host
-            }
-    href_list.append(href)
-    yield list_item
-```
-
-### 部署爬虫
-    1、将编辑好的爬虫放到自己的爬虫文件夹之下,下面是示例
-![在这里插入图片描述](https://img-blog.csdnimg.cn/061efe986db8402bb13b482c8d447f91.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-![在这里插入图片描述](https://img-blog.csdnimg.cn/75d4c7851a2e435cafac29f627faaa4b.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-    2、根据爬虫数据采集量创建定时任务
-![在这里插入图片描述](https://img-blog.csdnimg.cn/227f32935f8e4f4fa6b19bea96805b37.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-![在这里插入图片描述](https://img-blog.csdnimg.cn/3f4e2bffe2e042eca0cbc35b99817f81.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-
-
-    3、创建好定时任务后点击启用即可
-
-![在这里插入图片描述](https://img-blog.csdnimg.cn/ffe8e2ec981d4f798b7efa44406926be.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBA56u55LmL56yR,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)
-
-
-
-

+ 1 - 2
README.md

@@ -1,2 +1 @@
-# 
-
+# 
