
Roll back the OSS upload method for body attachments

dzr committed 1 month ago
parent commit 375d35fcce

+ 10 - 2
zbytb/config/conf.yaml

@@ -20,5 +20,13 @@ proxy:
     auth:
       Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB
 
-oss:
-  address: http://172.17.162.27:18011
+# Aliyun OSS
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # for public network access
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # for internal network access
+  bucket_name: jy-datafile
+  jy:
+    address: http://172.17.162.27:18011
+    address_test: http://172.31.31.203:1111
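For context, a minimal sketch of how this new `ali_oss` block can be read (same `yaml.safe_load` flow as `config/load.py` below; the literal path here is illustrative):

```python
import yaml

# Illustrative path; config/load.py resolves the real conf.yaml location.
with open("zbytb/config/conf.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

ali_oss = conf["ali_oss"]
print(ali_oss["endpoint"])       # oss-cn-beijing-internal.aliyuncs.com
print(ali_oss["jy"]["address"])  # http://172.17.162.27:18011
```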

+ 2 - 2
zbytb/config/load.py

@@ -4,7 +4,7 @@ import yaml
 
 __all__ = [
     'mongo_conf',
-    'oss_address',
+    'ali_oss',
     'jy_proxy',
     'es_conf',
     'headers',
@@ -21,7 +21,7 @@ with open(_yaml_conf, encoding="utf-8") as f:
     mongo_conf = conf['mongo']
     es_conf: dict = conf['es']
     jy_proxy: dict = conf['proxy']
-    oss_address = conf['oss']["address"]
+    ali_oss: dict = conf['ali_oss']
 
 with open(_yaml_constants, encoding="utf-8") as fp:
     constants = yaml.safe_load(fp)
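Call sites that imported the removed `oss_address` must now read from the `ali_oss` dict, as `utils/attachment.py` does below:

```python
# Before this commit:
# from config.load import oss_address

# After:
from config.load import ali_oss

endpoint = ali_oss['endpoint']         # oss-cn-beijing-internal.aliyuncs.com
jy_address = ali_oss['jy']['address']  # http://172.17.162.27:18011
```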

+ 1 - 1
zbytb/crawler/login.py

@@ -10,7 +10,7 @@ from requests.utils import dict_from_cookiejar
 _lock = threading.Lock()
 ROOT_PATH = Path(__file__).parent.parent
 
-JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json.json').resolve()
+JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json').resolve()
 
 User = namedtuple('User', ['username', 'password'])
 

+ 26 - 25
zbytb/crawler/spiders/DetailPageSpider.py

@@ -11,9 +11,9 @@ from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
-from utils.attachment import (
+from utils.attachment import AttachmentDownloader
+from utils.clean_file import (
     extract_file_type,
-    AttachmentDownloader,
     extract_file_name_by_href
 )
 from utils.databases import mongo_table, int2long
@@ -112,35 +112,32 @@ class CrawlDetailPageSpider:
         logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        index = 0
-        attachments = {}
         soup = BeautifulSoup(content, "lxml")
-        all_node = soup.findAll("a") or soup.findAll("iframe")
-        for node in all_node:
+        attachments = {}
+        nums = 0
+        nodes = soup.findAll("a") or soup.findAll("iframe")
+        for node in nodes:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
             # The attachment may be embedded in an iframe
             _id = node.attrs.get('id')
             if _id == 'pdfContainer':
                 file_type = 'pdf'
+
             # Extract the file type
-            elif file_type is None:
-                file_type = (extract_file_type(file_name)
-                             or extract_file_type(file_path))
+            if file_type is None:
+                file_type = (extract_file_type(file_name) or extract_file_type(file_path))
 
             # Extract the file name
             try:
                 parser = urlparse(file_path)
-            except ValueError:
-                pass
-            else:
                 if parser.scheme in ['https', 'http'] and file_type is not None:
                     if not file_name:
                         name = extract_file_name_by_href(file_path, file_type)
                         if name is not None:
                             file_name = name
                         else:
-                            file_name = f"{rows['title']}_{index}"
+                            file_name = f"{rows['title']}_{nums}"
 
                     attachment = self.attachment_downloader.download(
                         file_name=file_name,
@@ -148,8 +145,10 @@ class CrawlDetailPageSpider:
                         download_url=file_path,
                     )
                     if len(attachment) > 0:
-                        attachments[str(index + 1)] = attachment
-                        index += 1
+                        attachments[str(len(attachments) + 1)] = attachment
+                        nums += 1
+            except ValueError:
+                pass
 
         file_url = soup.findAll('pdfpath')
         if file_url:
@@ -157,13 +156,13 @@ class CrawlDetailPageSpider:
             file_type = extract_file_type(file_url)
             file_name = rows['title']
             if file_type:
-                attachment2 = self.attachment_downloader.download(
+                attachment = self.attachment_downloader.download(
                     file_name=file_name,
                     file_type=file_type,
-                    download_url=file_path,
+                    download_url=file_url,
                 )
-            if len(attachment2) > 0:
-                attachments[str(len(attachments) + 1)] = attachment2
+                if len(attachment) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment
 
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
@@ -195,12 +194,14 @@ class CrawlDetailPageSpider:
             root = Selector(text=response.text)
             content = root.xpath('//div[@class="conent-box"]').extract_first()
             if content:
-                extr_html1 = root.xpath('//div[@class="conent-box"]/div[@class="xgxm"]').extract_first()
-                extr_html2 = root.xpath('//div[@class="content-user"]').extract_first()
-                if extr_html1:
-                    content = content.replace(extr_html1,'')
-                if extr_html2:
-                    content = content.replace(extr_html2,'')
+                clean_features = [
+                    '//div[@class="conent-box"]/div[@class="xgxm"]',
+                    '//div[@class="content-user"]'
+                ]
+                for feature in clean_features:
+                    clean_html = root.xpath(feature).extract_first()
+                    if clean_html is not None:
+                        content = content.replace(clean_html, '')
             else:
                 content = ''
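The replaced per-variable cleanup is now a loop over XPath features; a standalone sketch (assuming `Selector` is parsel's, with hypothetical HTML):

```python
from parsel import Selector

html = ('<div class="conent-box">notice body'
        '<div class="xgxm">related items</div>'
        '<div class="content-user">contact card</div></div>')
root = Selector(text=html)
content = root.xpath('//div[@class="conent-box"]').extract_first() or ''
clean_features = [
    '//div[@class="conent-box"]/div[@class="xgxm"]',
    '//div[@class="content-user"]',
]
for feature in clean_features:
    clean_html = root.xpath(feature).extract_first()
    if clean_html is not None:
        content = content.replace(clean_html, '')

print(content)  # <div class="conent-box">notice body</div>
```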
 

+ 50 - 11
zbytb/utils/attachment.py

@@ -4,10 +4,11 @@ import traceback
 import uuid
 from io import BytesIO
 
+import oss2
 import requests
 import urllib3
 
-from config.load import headers, oss_address
+from config.load import headers, ali_oss
 from utils.clean_file import (
     clean_file_name,
     judge_file_url,
@@ -106,11 +107,48 @@ class OssClient(object):
         return reply
 
 
-class AttachmentDownloader:
+class OssBucketClient:
 
     def __init__(self):
+        key_id = ali_oss['key_id']
+        key_secret = ali_oss['key_secret']
+        endpoint = ali_oss['endpoint']
+        bucket_name = ali_oss['bucket_name']
+        auth = oss2.Auth(key_id, key_secret)
+        self._bucket = oss2.Bucket(auth, endpoint, bucket_name)
+
+    def push_oss_from_local(self, key, filename):
+        """
+        Upload a local file to OSS as a regular object.
+
+        :param str key: object name to store under in OSS
+        :param str filename: local file path; must be readable
+        """
+        return self._bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        Stream-upload data to OSS.
+
+        :param str key: object name to store under in OSS
+        :param data: content to upload
+        :type data: bytes, str, or a file-like object
+        """
+        return self._bucket.put_object(key, data)
+
+
+class AttachmentDownloader:
+
+    def __init__(self, address=None, mode=None):
         self.dir_name = 'file'
-        self._oss = OssClient(domain=oss_address)
+        if address is None:
+            address = ali_oss['jy']['address']
+
+        if mode == 'test':
+            address = ali_oss['jy']['address_test']
+
+        # self._oss = OssClient(domain=address)
+        self._bucket = OssBucketClient()
 
     def _create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
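With the new signature, construction might look like this; note that `mode='test'` overrides any explicit `address`, and that both addresses currently only feed the commented-out `OssClient`, since uploads now go through `OssBucketClient`:

```python
downloader = AttachmentDownloader()                  # ali_oss['jy']['address']
test_downloader = AttachmentDownloader(mode='test')  # ali_oss['jy']['address_test']
```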
@@ -255,14 +293,15 @@ class AttachmentDownloader:
                 result.setdefault('ftype', file_type)
                 result.setdefault('size', self._file_size(local_tmp_file))
                 result.setdefault('url', 'oss')
-
-                args = {
-                    "bucket_id": "file",
-                    "object_name": key,
-                    "gzip": False,
-                    "stream": file_stream
-                }
-                self._oss.upload(args, err_show=True)
+                self._bucket.push_oss_from_local(key, local_tmp_file)
+
+                # args = {
+                #     "bucket_id": "file",
+                #     "object_name": key,
+                #     "gzip": False,
+                #     "stream": file_stream
+                # }
+                # self._oss.upload(args, err_show=True)
             except Exception as e:
                 logger.warning(
                     "[{}]下载异常,原因:{}".format(file_name, type(e).__name__)