@@ -11,9 +11,9 @@ from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
-from utils.attachment import (
+from utils.attachment import AttachmentDownloader
+from utils.clean_file import (
     extract_file_type,
-    AttachmentDownloader,
     extract_file_name_by_href
 )
 from utils.databases import mongo_table, int2long
@@ -112,35 +112,32 @@ class CrawlDetailPageSpider:
             logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        index = 0
-        attachments = {}
         soup = BeautifulSoup(content, "lxml")
-        all_node = soup.findAll("a") or soup.findAll("iframe")
-        for node in all_node:
+        attachments = {}
+        nums = 0
+        nodes = soup.findAll("a") or soup.findAll("iframe")
+        for node in nodes:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
             # the attachment may be contained in an iframe
             _id = node.attrs.get('id')
             if _id == 'pdfContainer':
                 file_type = 'pdf'
+
             # extract the file type
-            elif file_type is None:
-                file_type = (extract_file_type(file_name)
-                             or extract_file_type(file_path))
+            if file_type is None:
+                file_type = (extract_file_type(file_name) or extract_file_type(file_path))
 
             # extract the file name
             try:
                 parser = urlparse(file_path)
-            except ValueError:
-                pass
-            else:
                 if parser.scheme in ['https', 'http'] and file_type is not None:
                     if not file_name:
                         name = extract_file_name_by_href(file_path, file_type)
                         if name is not None:
                             file_name = name
                         else:
-                            file_name = f"{rows['title']}_{index}"
+                            file_name = f"{rows['title']}_{nums}"
 
                     attachment = self.attachment_downloader.download(
                         file_name=file_name,
@@ -148,8 +145,10 @@ class CrawlDetailPageSpider:
                         download_url=file_path,
                     )
                     if len(attachment) > 0:
-                        attachments[str(index + 1)] = attachment
-                    index += 1
+                        attachments[str(len(attachments) + 1)] = attachment
+                    nums += 1
+            except ValueError:
+                pass
 
         file_url = soup.findAll('pdfpath')
         if file_url:
@@ -157,13 +156,13 @@ class CrawlDetailPageSpider:
             file_type = extract_file_type(file_url)
             file_name = rows['title']
             if file_type:
-                attachment2 = self.attachment_downloader.download(
+                attachment = self.attachment_downloader.download(
                     file_name=file_name,
                     file_type=file_type,
-                    download_url=file_path,
+                    download_url=file_url,
                 )
-                if len(attachment2) > 0:
-                    attachments[str(len(attachments) + 1)] = attachment2
+                if len(attachment) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment
 
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
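For reference, a minimal self-contained sketch of the numbering flow these hunks arrive at: the dict key now comes from len(attachments) + 1, so a download that returns an empty result no longer leaves a gap in the keys, while nums only feeds the fallback file name. Everything named below (collect_attachments, stub_download, the link tuples, the 'demo' title) is a stand-in invented for illustration, not part of the crawler's API.

from urllib.parse import urlparse


def collect_attachments(nodes, title, download):
    # nodes: (file_name, file_path, file_type) tuples already pulled from the page;
    # download: callable returning an attachment dict, or {} when the fetch fails.
    attachments = {}
    nums = 0
    for file_name, file_path, file_type in nodes:
        try:
            parser = urlparse(file_path)
            if parser.scheme in ['https', 'http'] and file_type is not None:
                if not file_name:
                    file_name = f"{title}_{nums}"
                attachment = download(file_name, file_type, file_path)
                if len(attachment) > 0:
                    attachments[str(len(attachments) + 1)] = attachment
                nums += 1
        except ValueError:
            # malformed href/src values are skipped, as in the patched method
            pass
    return attachments


def stub_download(name, ftype, url):
    # pretend the second link fails so its dict comes back empty
    return {} if 'bad' in url else {'filename': f'{name}.{ftype}', 'url': url}


links = [
    ('notice', 'https://example.com/a.pdf', 'pdf'),
    ('broken', 'https://example.com/bad.pdf', 'pdf'),
    ('spec', 'https://example.com/b.doc', 'doc'),
]
print(collect_attachments(links, 'demo', stub_download))  # keys '1' and '2', no gap left by the failure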
@@ -195,12 +194,14 @@ class CrawlDetailPageSpider:
         root = Selector(text=response.text)
         content = root.xpath('//div[@class="conent-box"]').extract_first()
         if content:
-            extr_html1 = root.xpath('//div[@class="conent-box"]/div[@class="xgxm"]').extract_first()
-            extr_html2 = root.xpath('//div[@class="content-user"]').extract_first()
-            if extr_html1:
-                content = content.replace(extr_html1,'')
-            if extr_html2:
-                content = content.replace(extr_html2,'')
+            clean_features = [
+                '//div[@class="conent-box"]/div[@class="xgxm"]',
+                '//div[@class="content-user"]'
+            ]
+            for feature in clean_features:
+                clean_html = root.xpath(feature).extract_first()
+                if clean_html is not None:
+                    content = content.replace(clean_html, '')
         else:
             content = ''
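As a usage note, the new clean_features loop can be exercised on its own against an inline fragment. parsel's Selector is assumed here because the hunk calls Selector(text=...).xpath(...).extract_first(); the scrapy Selector exposes the same methods, and the HTML below is invented for illustration.

from parsel import Selector

html = '''
<div class="conent-box">
  <p>notice body</p>
  <div class="xgxm">related projects block</div>
</div>
<div class="content-user">user toolbar</div>
'''

root = Selector(text=html)
content = root.xpath('//div[@class="conent-box"]').extract_first() or ''

# same XPath list as the patch: any feature that matches is stripped from content
clean_features = [
    '//div[@class="conent-box"]/div[@class="xgxm"]',
    '//div[@class="content-user"]'
]
for feature in clean_features:
    clean_html = root.xpath(feature).extract_first()
    if clean_html is not None:
        content = content.replace(clean_html, '')

print(content)  # the conent-box markup with the xgxm block removed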