|
@@ -7,8 +7,6 @@ from urllib.parse import urlparse, unquote
|
|
|
|
|
|
import requests
|
|
|
import urllib3
|
|
|
-import sys
|
|
|
-sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
|
|
|
|
|
|
from feapder.setting import headers
|
|
|
from untils.execptions import AttachmentNullError
|
|
@@ -18,33 +16,26 @@ from untils.proxy_pool import ProxyPool
|
|
|
urllib3.disable_warnings()
|
|
|
|
|
|
|
|
|
-def sha1(val):
|
|
|
- _sha1 = hashlib.sha1()
|
|
|
+def hex_sha1(val):
|
|
|
+ sha1 = hashlib.sha1()
|
|
|
if isinstance(val, bytes):
|
|
|
- _sha1.update(str(val).encode("utf-8"))
|
|
|
+ sha1.update(str(val).encode("utf-8"))
|
|
|
elif isinstance(val, str):
|
|
|
- _sha1.update(val.encode("utf-8"))
|
|
|
- return _sha1.hexdigest()
|
|
|
+ sha1.update(val.encode("utf-8"))
|
|
|
+ res = sha1.hexdigest()
|
|
|
+ return res
|
|
|
|
|
|
|
|
|
-def remove(file_path: str):
|
|
|
- os.remove(file_path)
|
|
|
-
|
|
|
-
|
|
|
-def getsize(file_path: str):
|
|
|
- try:
|
|
|
- return os.path.getsize(file_path)
|
|
|
- except FileNotFoundError:
|
|
|
- return 0
|
|
|
-
|
|
|
+def extract_file_type(text):
|
|
|
+ if text is None:
|
|
|
+ return None
|
|
|
|
|
|
-def discern_file_format(text):
|
|
|
file_types = {
|
|
|
- 'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png', 'swf'
|
|
|
+ 'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
|
|
|
}
|
|
|
for file_type in file_types:
|
|
|
- all_file_format = [file_type, file_type.upper()]
|
|
|
- for t in all_file_format:
|
|
|
+ tmp = [file_type, file_type.upper()]
|
|
|
+ for t in tmp:
|
|
|
result = re.match(f'.*{t}$', text, re.S)
|
|
|
if result is not None:
|
|
|
return t
|
|
@@ -52,14 +43,7 @@ def discern_file_format(text):
|
|
|
return None
|
|
|
|
|
|
|
|
|
-def extract_file_type(text):
|
|
|
- if text is None:
|
|
|
- return None
|
|
|
- return discern_file_format(text)
|
|
|
-
|
|
|
-
|
|
|
-def extract_file_name_by_href(href: str, file_type: str):
|
|
|
- """从url中抽取文件名称"""
|
|
|
+def extract_file_name(href: str, file_type: str):
|
|
|
# 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
|
|
|
# 中文字符:[\u4e00 -\u9fa5]
|
|
|
zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
|
|
@@ -74,56 +58,29 @@ def extract_file_name_by_href(href: str, file_type: str):
|
|
|
return None
|
|
|
|
|
|
|
|
|
-def extract_file_name(text):
|
|
|
- file_type = discern_file_format(text)
|
|
|
- if file_type is not None:
|
|
|
- repl = '.{}'.format(file_type)
|
|
|
- text = text.replace(repl, '')
|
|
|
- return text
|
|
|
-
|
|
|
-
|
|
|
def verify_file_name(name):
|
|
|
if extract_file_type(name) is None:
|
|
|
raise ValueError
|
|
|
|
|
|
|
|
|
-class AttachmentNullError(Exception):
|
|
|
-
|
|
|
- def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
|
|
|
- self.code = code
|
|
|
- self.reason = reason
|
|
|
- self.err_details = kwargs
|
|
|
- for key, val in kwargs.items():
|
|
|
- setattr(self, key, val)
|
|
|
-
|
|
|
-
|
|
|
class AttachmentDownloader:
|
|
|
|
|
|
def __init__(self):
|
|
|
- self.dir_name = 'file'
|
|
|
+ self.dir_name = '/file'
|
|
|
+
|
|
|
+ def create_dir(self):
|
|
|
+ if not os.path.exists(self.dir_name):
|
|
|
+ os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
|
|
|
|
|
|
- def get_file_path(self, filename, file_type):
|
|
|
- os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
|
|
|
- sha1_name = sha1("{}_{}".format(filename, uuid.uuid4()))
|
|
|
- tmp_name = "{}.{}".format(sha1_name, file_type)
|
|
|
+ def create_file_path(self, filename, file_type):
|
|
|
+ self.create_dir()
|
|
|
+ sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
|
|
|
+ tmp_name = "{}.{}".format(sign, file_type)
|
|
|
return "{}/{}".format(self.dir_name, tmp_name)
|
|
|
|
|
|
@staticmethod
|
|
|
def create_fid(file_stream: bytes):
|
|
|
- return sha1(file_stream)
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def file_size(file_path: str):
|
|
|
- _kb = float(getsize(file_path)) / 1024
|
|
|
- if _kb >= 1024:
|
|
|
- _M = _kb / 1024
|
|
|
- if _M >= 1024:
|
|
|
- _G = _M / 1024
|
|
|
- return "{:.1f} G".format(_G)
|
|
|
- else:
|
|
|
- return "{:.1f} M".format(_M)
|
|
|
- else:
|
|
|
- return "{:.1f} kb".format(_kb)
|
|
|
+ return hex_sha1(file_stream)
|
|
|
|
|
|
@staticmethod
|
|
|
def _fetch_attachment(
|
|
@@ -162,6 +119,29 @@ class AttachmentDownloader:
|
|
|
retries += 1
|
|
|
return b''
|
|
|
|
|
|
+ @staticmethod
|
|
|
+ def clean_attachment(file_path):
|
|
|
+ os.remove(file_path)
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def getsize(file_path: str):
|
|
|
+ def _getsize(filename):
|
|
|
+ try:
|
|
|
+ return os.path.getsize(filename)
|
|
|
+ except:
|
|
|
+ return 0
|
|
|
+
|
|
|
+ _kb = float(_getsize(file_path)) / 1024
|
|
|
+ if _kb >= 1024:
|
|
|
+ _M = _kb / 1024
|
|
|
+ if _M >= 1024:
|
|
|
+ _G = _M / 1024
|
|
|
+ return "{:.1f} G".format(_G)
|
|
|
+ else:
|
|
|
+ return "{:.1f} M".format(_M)
|
|
|
+ else:
|
|
|
+ return "{:.1f} kb".format(_kb)
|
|
|
+
|
|
|
def fetch_attachment(
|
|
|
self,
|
|
|
file_name: str,
|
|
@@ -174,7 +154,7 @@ class AttachmentDownloader:
|
|
|
if not file_name or not file_type or not download_url:
|
|
|
raise AttachmentNullError
|
|
|
|
|
|
- file_path = self.get_file_path(file_name, file_type)
|
|
|
+ file_path = self.create_file_path(file_name, file_type)
|
|
|
file_stream = self._fetch_attachment(
|
|
|
download_url,
|
|
|
file_path,
|
|
@@ -184,35 +164,35 @@ class AttachmentDownloader:
|
|
|
)
|
|
|
if len(file_stream) > 0:
|
|
|
fid = self.create_fid(file_stream)
|
|
|
- '''上传/下载,无论失败/成功最终返回附件信息'''
|
|
|
+ '''上传/下载,无论失败成功都需要给出文件基础信息'''
|
|
|
try:
|
|
|
result = {
|
|
|
- 'filename': '{}.{}'.format(file_name, file_type),
|
|
|
+ 'filename': file_name,
|
|
|
'ftype': file_type,
|
|
|
'fid': "{}.{}".format(fid, file_type),
|
|
|
'org_url': download_url,
|
|
|
- 'size': self.file_size(file_path),
|
|
|
+ 'size': self.getsize(file_path),
|
|
|
'url': 'oss',
|
|
|
}
|
|
|
AliYunService().push_oss_from_local(result['fid'], file_path)
|
|
|
except Exception:
|
|
|
result = {
|
|
|
- 'filename': '{}.{}'.format(file_name, file_type),
|
|
|
+ 'filename': file_name,
|
|
|
'org_url': download_url,
|
|
|
}
|
|
|
+ self.clean_attachment(file_path)
|
|
|
else:
|
|
|
result = {
|
|
|
- 'filename': '{}.{}'.format(file_name, file_type),
|
|
|
+ 'filename': file_name,
|
|
|
'org_url': download_url,
|
|
|
}
|
|
|
- remove(file_path)
|
|
|
return result
|
|
|
|
|
|
|
|
|
-if __name__ == '__main__':
|
|
|
- a = AttachmentDownloader().fetch_attachment(
|
|
|
- file_name='成建制移民村(五标段)合同',
|
|
|
- file_type='pdf',
|
|
|
- download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
|
|
|
- )
|
|
|
- print(a)
|
|
|
+# if __name__ == '__main__':
|
|
|
+ # a = AttachmentDownloader().fetch_attachment(
|
|
|
+ # file_name='成建制移民村(五标段)合同',
|
|
|
+ # file_type='pdf',
|
|
|
+ # download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
|
|
|
+ # )
|
|
|
+ # print(a)
|