3
0

attachment.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. import hashlib
  2. import os
  3. import re
  4. import traceback
  5. import uuid
  6. from urllib.parse import urlparse, unquote
  7. import requests
  8. import urllib3
  9. import sys
  10. sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
  11. from feapder.setting import headers
  12. from untils.execptions import AttachmentNullError
  13. from untils.aliyun import AliYunService
  14. from untils.proxy_pool import ProxyPool
  15. urllib3.disable_warnings()
  16. def sha1(val):
  17. _sha1 = hashlib.sha1()
  18. if isinstance(val, bytes):
  19. _sha1.update(str(val).encode("utf-8"))
  20. elif isinstance(val, str):
  21. _sha1.update(val.encode("utf-8"))
  22. return _sha1.hexdigest()
  23. def remove(file_path: str):
  24. os.remove(file_path)
  25. def getsize(file_path: str):
  26. try:
  27. return os.path.getsize(file_path)
  28. except FileNotFoundError:
  29. return 0
  30. def discern_file_format(text):
  31. file_types = {
  32. 'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png', 'swf'
  33. }
  34. for file_type in file_types:
  35. all_file_format = [file_type, file_type.upper()]
  36. for t in all_file_format:
  37. result = re.match(f'.*{t}$', text, re.S)
  38. if result is not None:
  39. return t
  40. else:
  41. return None
  42. def extract_file_type(text):
  43. if text is None:
  44. return None
  45. return discern_file_format(text)
  46. def extract_file_name_by_href(href: str, file_type: str):
  47. """从url中抽取文件名称"""
  48. # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
  49. # 中文字符:[\u4e00 -\u9fa5]
  50. zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
  51. parser = urlparse(href)
  52. query = (parser.query or parser.path)
  53. result = re.search(f'.*\\.{file_type}', query, re.S)
  54. if result is not None:
  55. encode_str = unquote(result.group())
  56. name = re.search(zh_char_pattern, encode_str)
  57. if name is not None:
  58. return unquote(name.group())
  59. return None
  60. def extract_file_name(text):
  61. file_type = discern_file_format(text)
  62. if file_type is not None:
  63. repl = '.{}'.format(file_type)
  64. text = text.replace(repl, '')
  65. return text
  66. def verify_file_name(name):
  67. if extract_file_type(name) is None:
  68. raise ValueError
  69. class AttachmentNullError(Exception):
  70. def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
  71. self.code = code
  72. self.reason = reason
  73. self.err_details = kwargs
  74. for key, val in kwargs.items():
  75. setattr(self, key, val)
  76. class AttachmentDownloader:
  77. def __init__(self):
  78. self.dir_name = 'file'
  79. def get_file_path(self, filename, file_type):
  80. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  81. sha1_name = sha1("{}_{}".format(filename, uuid.uuid4()))
  82. tmp_name = "{}.{}".format(sha1_name, file_type)
  83. return "{}/{}".format(self.dir_name, tmp_name)
  84. @staticmethod
  85. def create_fid(file_stream: bytes):
  86. return sha1(file_stream)
  87. @staticmethod
  88. def file_size(file_path: str):
  89. _kb = float(getsize(file_path)) / 1024
  90. if _kb >= 1024:
  91. _M = _kb / 1024
  92. if _M >= 1024:
  93. _G = _M / 1024
  94. return "{:.1f} G".format(_G)
  95. else:
  96. return "{:.1f} M".format(_M)
  97. else:
  98. return "{:.1f} kb".format(_kb)
  99. @staticmethod
  100. def _fetch_attachment(
  101. url: str,
  102. file_path: str,
  103. enable_proxy=False,
  104. allow_show_exception=False,
  105. **kwargs
  106. ):
  107. request_params = {}
  108. request_params.setdefault('headers', kwargs.get('headers') or headers)
  109. request_params.setdefault('proxies', kwargs.get('proxies'))
  110. request_params.setdefault('timeout', kwargs.get('timeout') or 60)
  111. request_params.setdefault('stream', kwargs.get('stream') or True)
  112. request_params.setdefault('verify', kwargs.get('verify') or False)
  113. if enable_proxy:
  114. proxy = ProxyPool()
  115. else:
  116. proxy = {}
  117. retries = 0
  118. while retries < 3:
  119. try:
  120. with requests.get(url, **request_params) as req:
  121. if req.status_code == 200:
  122. stream = req.content
  123. with open(file_path, 'wb') as f:
  124. f.write(stream)
  125. return stream
  126. else:
  127. retries += 1
  128. except requests.RequestException:
  129. if allow_show_exception:
  130. traceback.print_exc()
  131. if enable_proxy:
  132. request_params.update({'proxies': proxy.get()})
  133. retries += 1
  134. return b''
  135. def fetch_attachment(
  136. self,
  137. file_name: str,
  138. file_type: str,
  139. download_url: str,
  140. enable_proxy=False,
  141. allow_request_exception=False,
  142. **kwargs
  143. ):
  144. if not file_name or not file_type or not download_url:
  145. raise AttachmentNullError
  146. file_path = self.get_file_path(file_name, file_type)
  147. file_stream = self._fetch_attachment(
  148. download_url,
  149. file_path,
  150. enable_proxy,
  151. allow_request_exception,
  152. **kwargs
  153. )
  154. if len(file_stream) > 0:
  155. fid = self.create_fid(file_stream)
  156. '''上传/下载,无论失败/成功最终返回附件信息'''
  157. try:
  158. result = {
  159. 'filename': '{}.{}'.format(file_name, file_type),
  160. 'ftype': file_type,
  161. 'fid': "{}.{}".format(fid, file_type),
  162. 'org_url': download_url,
  163. 'size': self.file_size(file_path),
  164. 'url': 'oss',
  165. }
  166. AliYunService().push_oss_from_local(result['fid'], file_path)
  167. except Exception:
  168. result = {
  169. 'filename': '{}.{}'.format(file_name, file_type),
  170. 'org_url': download_url,
  171. }
  172. else:
  173. result = {
  174. 'filename': '{}.{}'.format(file_name, file_type),
  175. 'org_url': download_url,
  176. }
  177. remove(file_path)
  178. return result
  179. if __name__ == '__main__':
  180. a = AttachmentDownloader().fetch_attachment(
  181. file_name='成建制移民村(五标段)合同',
  182. file_type='pdf',
  183. download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
  184. )
  185. print(a)