
Roll back the OSS upload method for body attachments

dzr committed 1 month ago
parent commit 375d35fcce

+ 10 - 2
zbytb/config/conf.yaml

@@ -20,5 +20,13 @@ proxy:
     auth:
       Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB
 
-oss:
-  address: http://172.17.162.27:18011
+# Aliyun OSS
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # for public network access
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # for internal network access
+  bucket_name: jy-datafile
+  jy:
+    address: http://172.17.162.27:18011
+    address_test: http://172.31.31.203:1111
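For context, a minimal sketch of how this new `ali_oss` block can be read (same `yaml.safe_load` flow as `config/load.py` below; the literal path here is illustrative):

```python
import yaml

# Illustrative path; config/load.py resolves the real conf.yaml location.
with open("zbytb/config/conf.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

ali_oss = conf["ali_oss"]
print(ali_oss["endpoint"])       # oss-cn-beijing-internal.aliyuncs.com
print(ali_oss["jy"]["address"])  # http://172.17.162.27:18011
```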

+ 2 - 2
zbytb/config/load.py

@@ -4,7 +4,7 @@ import yaml
 
 __all__ = [
     'mongo_conf',
-    'oss_address',
+    'ali_oss',
     'jy_proxy',
     'es_conf',
     'headers',
@@ -21,7 +21,7 @@ with open(_yaml_conf, encoding="utf-8") as f:
     mongo_conf = conf['mongo']
     es_conf: dict = conf['es']
     jy_proxy: dict = conf['proxy']
-    oss_address = conf['oss']["address"]
+    ali_oss: dict = conf['ali_oss']
 
 with open(_yaml_constants, encoding="utf-8") as fp:
     constants = yaml.safe_load(fp)
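Call sites that imported the removed `oss_address` must now read from the `ali_oss` dict, as `utils/attachment.py` does below:

```python
# Before this commit:
# from config.load import oss_address

# After:
from config.load import ali_oss

endpoint = ali_oss['endpoint']         # oss-cn-beijing-internal.aliyuncs.com
jy_address = ali_oss['jy']['address']  # http://172.17.162.27:18011
```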

+ 1 - 1
zbytb/crawler/login.py

@@ -10,7 +10,7 @@ from requests.utils import dict_from_cookiejar
 _lock = threading.Lock()
 ROOT_PATH = Path(__file__).parent.parent
 
-JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json.json').resolve()
+JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json').resolve()
 
 User = namedtuple('User', ['username', 'password'])
 

+ 26 - 25
zbytb/crawler/spiders/DetailPageSpider.py

@@ -11,9 +11,9 @@ from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
-from utils.attachment import (
+from utils.attachment import AttachmentDownloader
+from utils.clean_file import (
     extract_file_type,
-    AttachmentDownloader,
     extract_file_name_by_href
 )
 from utils.databases import mongo_table, int2long
@@ -112,35 +112,32 @@ class CrawlDetailPageSpider:
         logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        index = 0
-        attachments = {}
         soup = BeautifulSoup(content, "lxml")
-        all_node = soup.findAll("a") or soup.findAll("iframe")
-        for node in all_node:
+        attachments = {}
+        nums = 0
+        nodes = soup.findAll("a") or soup.findAll("iframe")
+        for node in nodes:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
             # The attachment may be embedded in an iframe
             _id = node.attrs.get('id')
             if _id == 'pdfContainer':
                 file_type = 'pdf'
+
             # Extract the file type
-            elif file_type is None:
-                file_type = (extract_file_type(file_name)
-                             or extract_file_type(file_path))
+            if file_type is None:
+                file_type = (extract_file_type(file_name) or extract_file_type(file_path))
 
             # Extract the file name
             try:
                 parser = urlparse(file_path)
-            except ValueError:
-                pass
-            else:
                 if parser.scheme in ['https', 'http'] and file_type is not None:
                     if not file_name:
                         name = extract_file_name_by_href(file_path, file_type)
                         if name is not None:
                             file_name = name
                         else:
-                            file_name = f"{rows['title']}_{index}"
+                            file_name = f"{rows['title']}_{nums}"
 
                     attachment = self.attachment_downloader.download(
                         file_name=file_name,
@@ -148,8 +145,10 @@ class CrawlDetailPageSpider:
                         download_url=file_path,
                     )
                     if len(attachment) > 0:
-                        attachments[str(index + 1)] = attachment
-                        index += 1
+                        attachments[str(len(attachments) + 1)] = attachment
+                        nums += 1
+            except ValueError:
+                pass
 
         file_url = soup.findAll('pdfpath')
         if file_url:
@@ -157,13 +156,13 @@ class CrawlDetailPageSpider:
             file_type = extract_file_type(file_url)
             file_name = rows['title']
             if file_type:
-                attachment2 = self.attachment_downloader.download(
+                attachment = self.attachment_downloader.download(
                     file_name=file_name,
                     file_type=file_type,
-                    download_url=file_path,
+                    download_url=file_url,
                 )
-            if len(attachment2) > 0:
-                attachments[str(len(attachments) + 1)] = attachment2
+                if len(attachment) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment
 
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
@@ -195,12 +194,14 @@ class CrawlDetailPageSpider:
             root = Selector(text=response.text)
             content = root.xpath('//div[@class="conent-box"]').extract_first()
             if content:
-                extr_html1 = root.xpath('//div[@class="conent-box"]/div[@class="xgxm"]').extract_first()
-                extr_html2 = root.xpath('//div[@class="content-user"]').extract_first()
-                if extr_html1:
-                    content = content.replace(extr_html1,'')
-                if extr_html2:
-                    content = content.replace(extr_html2,'')
+                clean_features = [
+                    '//div[@class="conent-box"]/div[@class="xgxm"]',
+                    '//div[@class="content-user"]'
+                ]
+                for feature in clean_features:
+                    clean_html = root.xpath(feature).extract_first()
+                    if clean_html is not None:
+                        content = content.replace(clean_html, '')
             else:
                 content = ''
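The replaced per-variable cleanup is now a loop over XPath features; a standalone sketch (assuming `Selector` is parsel's, with hypothetical HTML):

```python
from parsel import Selector

html = ('<div class="conent-box">notice body'
        '<div class="xgxm">related items</div>'
        '<div class="content-user">contact card</div></div>')
root = Selector(text=html)
content = root.xpath('//div[@class="conent-box"]').extract_first() or ''
clean_features = [
    '//div[@class="conent-box"]/div[@class="xgxm"]',
    '//div[@class="content-user"]',
]
for feature in clean_features:
    clean_html = root.xpath(feature).extract_first()
    if clean_html is not None:
        content = content.replace(clean_html, '')

print(content)  # <div class="conent-box">notice body</div>
```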
 

+ 50 - 11
zbytb/utils/attachment.py

@@ -4,10 +4,11 @@ import traceback
 import uuid
 from io import BytesIO
 
+import oss2
 import requests
 import urllib3
 
-from config.load import headers, oss_address
+from config.load import headers, ali_oss
 from utils.clean_file import (
     clean_file_name,
     judge_file_url,
@@ -106,11 +107,48 @@ class OssClient(object):
         return reply
 
 
-class AttachmentDownloader:
+class OssBucketClient:
 
     def __init__(self):
+        key_id = ali_oss['key_id']
+        key_secret = ali_oss['key_secret']
+        endpoint = ali_oss['endpoint']
+        bucket_name = ali_oss['bucket_name']
+        auth = oss2.Auth(key_id, key_secret)
+        self._bucket = oss2.Bucket(auth, endpoint, bucket_name)
+
+    def push_oss_from_local(self, key, filename):
+        """
+        Upload a local file to OSS as a regular object.
+
+        :param str key: object name to store under in OSS
+        :param str filename: local file path; must be readable
+        """
+        return self._bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        Stream-upload data to OSS.
+
+        :param str key: object name to store under in OSS
+        :param data: content to upload
+        :type data: bytes, str, or a file-like object
+        """
+        return self._bucket.put_object(key, data)
+
+
+class AttachmentDownloader:
+
+    def __init__(self, address=None, mode=None):
         self.dir_name = 'file'
-        self._oss = OssClient(domain=oss_address)
+        if address is None:
+            address = ali_oss['jy']['address']
+
+        if mode == 'test':
+            address = ali_oss['jy']['address_test']
+
+        # self._oss = OssClient(domain=address)
+        self._bucket = OssBucketClient()
 
     def _create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
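With the new signature, construction might look like this; note that `mode='test'` overrides any explicit `address`, and that both addresses currently only feed the commented-out `OssClient`, since uploads now go through `OssBucketClient`:

```python
downloader = AttachmentDownloader()                  # ali_oss['jy']['address']
test_downloader = AttachmentDownloader(mode='test')  # ali_oss['jy']['address_test']
```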
@@ -255,14 +293,15 @@ class AttachmentDownloader:
                 result.setdefault('ftype', file_type)
                 result.setdefault('size', self._file_size(local_tmp_file))
                 result.setdefault('url', 'oss')
-
-                args = {
-                    "bucket_id": "file",
-                    "object_name": key,
-                    "gzip": False,
-                    "stream": file_stream
-                }
-                self._oss.upload(args, err_show=True)
+                self._bucket.push_oss_from_local(key, local_tmp_file)
+
+                # args = {
+                #     "bucket_id": "file",
+                #     "object_name": key,
+                #     "gzip": False,
+                #     "stream": file_stream
+                # }
+                # self._oss.upload(args, err_show=True)
             except Exception as e:
                 logger.warning(
                     "[{}]下载异常,原因:{}".format(file_name, type(e).__name__)