Browse Source

tools增加附件类型抽取方法

lizongze 2 years ago
parent
commit
b902aeeb75
1 changed files with 25 additions and 0 deletions
  1. 25 0
      FworkSpider/untils/tools.py

+ 25 - 0
FworkSpider/untils/tools.py

@@ -364,3 +364,28 @@ def remove_htmldata(remove_info_list:list, html:str, response):
                 if extra_html:
                     html = html.replace(extra_html, '')
     return html
+
+
def extract_file_type(file_name="附件名", file_url="附件地址"):
    """
    Extract the attachment type (file extension) of an attachment.

    The extension is looked up in the attachment URL first (with any
    query string removed); if the URL does not end in a recognised
    extension, the attachment name is used as a fallback.

    Args:
        file_name: attachment name
        file_url: attachment address

    Returns:
        The lower-cased extension when it is one of the recognised
        attachment types, otherwise None. None is also returned when
        either argument is empty.
    """
    # Both pieces of information are required, as in the original contract.
    if not (file_name and file_url):
        return None

    file_types = {
        'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
        'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg',
    }

    # The URL takes priority over the name. Both are stripped: scraped
    # href values frequently carry stray spaces or newlines, which would
    # otherwise end up inside the extension and defeat the lookup.
    for candidate in (file_url, file_name):
        without_query = candidate.strip().split('?')[0]
        file_type = without_query.split('.')[-1].strip().lower()
        if file_type in file_types:
            return file_type
    return None