Bladeren bron

feapder框架打包组件

dongzhaorui 2 jaren geleden
bovenliggende
commit
7c453ff972

+ 11 - 0
zgztb_cookie/FworkSpider/MANIFEST.in

@@ -0,0 +1,11 @@
+include README.md
+include LICENSE
+
+include feapder/requirements.txt
+include feapder/VERSION
+
+recursive-include feapder/utils/js *
+recursive-include feapder/templates *
+recursive-include tests *
+
+global-exclude __pycache__ *.py[cod]

+ 1 - 0
zgztb_cookie/FworkSpider/README.md

@@ -0,0 +1 @@
+# 中国招标投标公共服务平台

+ 156 - 0
zgztb_cookie/FworkSpider/feapder/dedup/README.md

@@ -0,0 +1,156 @@
+# Dedup
+
+Dedup是feapder大数据去重模块,内置3种去重机制,使用方式一致,可容纳的去重数据量与内存有关。不同于BloomFilter,去重受槽位数量影响,Dedup使用了弹性的去重机制,可容纳海量的数据去重。
+
+
+## 去重方式
+
+### 临时去重
+
+> 基于redis,支持批量,去重有时效性。去重一万条数据约0.26秒,一亿条数据占用内存约1.43G
+
+```
+from feapder.dedup import Dedup
+
+data = {"xxx": 123, "xxxx": "xxxx"}
+datas = ["xxx", "bbb"]
+
+def test_ExpireFilter():
+    dedup = Dedup(
+        Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0"
+    )
+
+    # 逐条去重
+    assert dedup.add(data) == 1
+    assert dedup.get(data) == 1
+
+    # 批量去重
+    assert dedup.add(datas) == [1, 1]
+    assert dedup.get(datas) == [1, 1]
+```
+
+
+### 内存去重
+
+> 基于内存,支持批量。去重一万条数据约0.5秒,一亿条数据占用内存约285MB
+
+```
+from feapder.dedup import Dedup
+
+data = {"xxx": 123, "xxxx": "xxxx"}
+datas = ["xxx", "bbb"]
+
+def test_MemoryFilter():
+    dedup = Dedup(Dedup.MemoryFilter)  # 基于内存去重,无需传入 redis 配置
+
+    # 逐条去重
+    assert dedup.add(data) == 1
+    assert dedup.get(data) == 1
+
+    # 批量去重
+    assert dedup.add(datas) == [1, 1]
+    assert dedup.get(datas) == [1, 1]
+```
+
+### 永久去重
+
+> 基于redis,支持批量,永久去重。 去重一万条数据约3.5秒,一亿条数据占用内存约285MB
+
+    from feapder.dedup import Dedup
+
+    datas = {
+        "xxx": 123,
+        "xxxx": "xxxx",
+    }
+
+    dedup = Dedup()
+
+    print(dedup) # <ScalableBloomFilter: RedisBitArray: dedup:bloomfilter:bloomfilter>
+    print(dedup.add(datas)) # 0 不存在
+    print(dedup.get(datas)) # 1 存在
+    
+## 过滤数据
+
+Dedup可以通过如下方法,过滤掉已存在的数据
+
+
+```python
+from feapder.dedup import Dedup
+
+def test_filter():
+    dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
+
+    # 制造已存在数据
+    datas = ["xxx", "bbb"]
+    dedup.add(datas)
+
+    # 过滤掉已存在数据 "xxx", "bbb"
+    datas = ["xxx", "bbb", "ccc"]
+    dedup.filter_exist_data(datas)
+    assert datas == ["ccc"]
+```
+
+```python
+# redis cluster 去重
+from feapder.dedup import Dedup
+
+def test_filter():
+    dedup = Dedup(Dedup.RedisFilter, ip_ports=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
+
+    # 制造已存在数据
+    datas = ["xxx", "bbb"]
+    dedup.add(datas)
+
+    # 过滤掉已存在数据 "xxx", "bbb"
+    datas = ["xxx", "bbb", "ccc"]
+    ss = dedup.filter_exist_data(datas)
+    print(ss)
+    assert datas == ["ccc"]
+```
+
+```python
+# redis 去重
+from feapder.dedup import Dedup
+
+def test_filter():
+    dedup = Dedup(Dedup.RedisFilter, expire_time=60)
+
+    # 制造已存在数据
+    datas = ["xxx", "bbb"]
+    dedup.add(datas)
+
+    # 过滤掉已存在数据 "xxx", "bbb"
+    datas = ["xxx", "bbb", "ccc"]
+    ss = dedup.filter_exist_data(datas)
+    print(ss)
+    assert datas == ["ccc"]
+```
+
+```python
+# redis 多实例去重
+from feapder.dedup import Dedup
+
+def test_filter():
+    redis_conf = dict(
+        pylist_=dict(
+            redisdb_ip_port="192.168.3.71:8371",
+            redisdb_user_pass="top@123",
+            redisdb_db=0
+        ),
+        list_=dict(
+            redisdb_ip_port="192.168.3.165:8165",
+            redisdb_user_pass="",
+            redisdb_db=0
+        )
+    )
+    
+    dedup = Dedup(filter_type=6, to_md5=False, redis_conf=redis_conf, expire_time=60)
+    datas = ["xxx", "bbb"]
+    dedup.add(datas)
+    
+    # 过滤掉已存在数据 "xxx", "bbb"
+    datas = ["xxx", "bbb", "ccc"]
+    dedup.filter_exist_data(datas)
+    print(datas)
+    assert datas == ["ccc"]
+```

+ 1 - 9
zgztb_cookie/FworkSpider/feapder/network/request.py

@@ -9,7 +9,6 @@ Created on 2018-07-25 11:49:08
 """
 
 import requests
-from func_timeout import func_set_timeout, FunctionTimedOut
 from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
@@ -252,7 +251,6 @@ class Request(object):
             else self.callback
         )
 
-    # @func_set_timeout(30, allowOverride=True)
     def get_response(self, save_cached=False):
         """
         获取带有selector功能的response
@@ -508,13 +506,7 @@ class Request(object):
         response_dict = self._cache_db.strget(self._cached_redis_key)
         if not response_dict:
             log.info("无response缓存  重新下载")
-            try:
-                response_obj = self.get_response(save_cached=save_cached)
-            except FunctionTimedOut:
-                response_obj = None
-                log.info("请求超时")
-                log.info("requests", extra={"url": self.url, "code": 0})
-
+            response_obj = self.get_response(save_cached=save_cached)
         else:
             response_dict = eval(response_dict)
             response_obj = Response.from_dict(response_dict)

+ 3 - 0
zgztb_cookie/FworkSpider/requirements.txt

@@ -0,0 +1,3 @@
+func-timeout==4.3.5
+tqdm==4.64.0
+oss2==2.14.0

+ 2 - 0
zgztb_cookie/FworkSpider/setup.cfg

@@ -0,0 +1,2 @@
+[easy_install]
+index_url = https://mirrors.aliyun.com/pypi/simple

+ 77 - 0
zgztb_cookie/FworkSpider/setup.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/22 10:45 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+from os.path import dirname, join
+from sys import version_info
+
+import setuptools
+
+if version_info < (3, 6, 0):
+    raise SystemExit("Sorry! feapder requires python 3.6.0 or later.")
+
+with open(join(dirname(__file__), "feapder/VERSION"), "rb") as fh:
+    version = fh.read().decode("ascii").strip()
+
+with open("README.md", "r", encoding="utf8") as fh:
+    long_description = fh.read()
+
+packages = setuptools.find_packages()
+packages.extend(
+    [
+        "feapder",
+        "feapder.templates",
+        "feapder.templates.project_template",
+        "feapder.templates.project_template.spiders",
+        "feapder.templates.project_template.items",
+    ]
+)
+
+requires = [
+    "better-exceptions>=0.2.2",
+    "DBUtils>=2.0",
+    "parsel>=1.5.2",
+    "PyMySQL>=0.9.3",
+    "redis>=2.10.6,<4.0.0",
+    "requests>=2.22.0",
+    "PySocks==1.7.1",
+    "bs4>=0.0.1",
+    "ipython>=7.14.0,<=8.12",
+    "redis-py-cluster>=2.1.0",
+    "cryptography>=3.3.2",
+    "selenium==3.141.0",
+    "pymongo>=3.10.1",
+    "urllib3>=1.25.8,<=1.25.11",
+    "loguru>=0.5.3",
+    "influxdb>=5.3.1",
+    "pyperclip>=1.8.2",
+    "terminal-layout>=2.1.3",
+    "python-logstash==0.4.8",
+]
+
+extras_requires = ["bitarray>=1.5.3", "PyExecJS>=1.5.1"]
+
+setuptools.setup(
+    name="feapder",
+    version=version,
+    author="Boris",
+    license="MIT",
+    author_email="",
+    python_requires=">=3.6",
+    description="feapder是一款支持分布式、批次采集、任务防丢、报警丰富的python爬虫框架",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    install_requires=requires,
+    extras_require={"all": extras_requires},
+    entry_points={"console_scripts": ["feapder = feapder.commands.cmdline:execute"]},
+    url="https://github.com/Boris-code/feapder.git",
+    packages=packages,
+    include_package_data=True,
+    classifiers=["Programming Language :: Python :: 3"],
+)

+ 8 - 5
zgztb_cookie/README.md

@@ -1,16 +1,19 @@
-#### 中国招标投标公共服务平台
+# 中国招标投标公共服务平台
 
-#### docker 构建和启动容器
+## feapder框架 -- FworkSpider
+#### docker镜像构建
     $ docker build -t centos7_zgzb:v1.0 .
+#### docker容器启动
     $ docker-compose --compatibility up -d
 
-##### 快照页执行脚本
+## 爬虫业务
+#### 快照页执行脚本
     $ ./start.sh
 
-##### 列表页执行脚本
+#### 列表页执行脚本
     $ python3 zgzbtb_spider.py
 
-##### crontab配置
+## crontab配置
     // 列表页采集爬虫
     10 * * * * flock -xn /app/zgzbtb_spider.py -c 'cd /app && python3 zgzbtb_spider.py'
     // 详情页爬虫