|
@@ -11,7 +11,7 @@ import os
|
|
|
sys.path.append(os.path.dirname(os.getcwd()))
|
|
|
from utils.attachment import AttachmentDownloader
|
|
|
from utils.tools import *
|
|
|
-from utils.aliyun import AliYunService
|
|
|
+from utils.aliyun import JyOssClient
|
|
|
import requests
|
|
|
import json
|
|
|
import base64
|
|
@@ -101,19 +101,21 @@ class Details:
|
|
|
res = requests.post(url, headers=headers, data=data, timeout=60, proxies=self.proxy, verify=False)
|
|
|
|
|
|
file_list = res.json().get('data')
|
|
|
-
|
|
|
- if file_list:
|
|
|
-
|
|
|
- for info in file_list:
|
|
|
- file_name = info.get('filename')
|
|
|
- file_url = f"https://b2b.10086.cn/api-b2b/api-file/file/downloadFileOnAuth?authFlag={info.get('authFlag')}&fileId={info.get('fileId')}&fileUuid={info.get('uuid')}"
|
|
|
- file_type = extract_file_type(file_name, file_url)
|
|
|
-
|
|
|
- attachment = AttachmentDownloader().fetch_attachment(
|
|
|
- file_name=file_name, file_type=file_type, download_url=file_url,
|
|
|
- proxies=self.proxy)
|
|
|
- if attachment.__contains__("fid"):
|
|
|
- attachments[str(len(attachments) + 1)] = attachment
|
|
|
+ if not file_list:
|
|
|
+ return attachments
|
|
|
+
|
|
|
+ downloader = AttachmentDownloader()
|
|
|
+ for info in file_list:
|
|
|
+ file_name = info.get('filename')
|
|
|
+ file_url = f"https://b2b.10086.cn/api-b2b/api-file/file/downloadFileOnAuth?authFlag={info.get('authFlag')}&fileId={info.get('fileId')}&fileUuid={info.get('uuid')}"
|
|
|
+ file_type = extract_file_type(file_name, file_url)
|
|
|
+ attachment = downloader.fetch_attachment(
|
|
|
+ file_name=file_name,
|
|
|
+ file_type=file_type,
|
|
|
+ download_url=file_url,
|
|
|
+ proxies=self.proxy
|
|
|
+ )
|
|
|
+ attachments[str(len(attachments) + 1)] = attachment
|
|
|
except:
|
|
|
pass
|
|
|
|
|
@@ -122,22 +124,27 @@ class Details:
|
|
|
def detail_get(self, response, item):
|
|
|
|
|
|
detail_info = response.json().get('data')
|
|
|
- html = detail_info.get('noticeContent')
|
|
|
+
|
|
|
+ attr2 = detail_info.get('uuid')
|
|
|
+ attachments = self.get_attachments(attr2)
|
|
|
+
|
|
|
html_file = {}
|
|
|
+ html = detail_info.get('noticeContent')
|
|
|
if len(html) > 100 and text_search(html).total == 0:
|
|
|
stream = base64.b64decode(html)
|
|
|
- fnm = item['title']
|
|
|
- if len(fnm) > 20:
|
|
|
- fnm = "附件"
|
|
|
+
|
|
|
+ fnm = "附件" if len(item['title']) > 20 else item['title']
|
|
|
file = f"file/{fnm}.pdf"
|
|
|
directory = os.path.dirname(file)
|
|
|
if not os.path.exists(directory):
|
|
|
os.makedirs(directory)
|
|
|
+
|
|
|
with open(file, 'wb') as f:
|
|
|
f.write(stream)
|
|
|
+
|
|
|
content_hash = get_sha1(stream)
|
|
|
fid = "{}.{}".format(content_hash, 'pdf')
|
|
|
- AliYunService().push_oss_from_local(fid, file)
|
|
|
+ JyOssClient().upload("file", fid, stream)
|
|
|
html_file = {
|
|
|
"filename": item['title'],
|
|
|
"org_url": item['href'],
|
|
@@ -146,24 +153,23 @@ class Details:
|
|
|
"ftype": "pdf",
|
|
|
"url": "oss"
|
|
|
}
|
|
|
+
|
|
|
try:
|
|
|
os.remove(file)
|
|
|
os.rmdir(file.replace('.pdf',''))
|
|
|
except FileNotFoundError:
|
|
|
pass
|
|
|
- html = "详情请访问原网页!"
|
|
|
|
|
|
- item["contenthtml"] = html
|
|
|
+ html = "详情请访问原网页!"
|
|
|
|
|
|
- attr2 = detail_info.get('uuid')
|
|
|
- attachments = self.get_attachments(attr2)
|
|
|
if html_file:
|
|
|
attachments[str(len(attachments) + 1)] = html_file
|
|
|
+
|
|
|
+ item["contenthtml"] = html
|
|
|
if attachments:
|
|
|
item["projectinfo"] = {"attachments": attachments}
|
|
|
|
|
|
item = format_fileds(item)
|
|
|
-
|
|
|
try:
|
|
|
self.zt_details.insert_one(item)
|
|
|
logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
|
|
@@ -172,8 +178,12 @@ class Details:
|
|
|
|
|
|
def fetch_request(self, item):
    """Send the detail-page POST request described by ``item``.

    Reads ``parse_url`` (the endpoint) and ``request_params`` (sent as the
    JSON body) from ``item`` and posts them using the instance's
    ``headers`` and ``proxy`` settings.

    :param item: mapping providing the ``parse_url`` and
        ``request_params`` entries.
    :return: the response object returned by ``requests.post``.
    """
    payload = item.get("request_params")
    target = item.get("parse_url")
    return requests.post(
        url=target,
        headers=self.headers,
        json=payload,
        proxies=self.proxy,
        # (connect, read) timeouts in seconds.
        timeout=(30, 60),
        # TLS certificate verification is switched off for this request.
        verify=False,
    )
|
|
|
|