Ver Fonte

更新附件抽取规则

lizongze há 1 ano atrás
pai
commit
dc2d30411c
1 ficheiros alterados com 7 adições e 9 exclusões
  1. 7 9
      FworkSpider/untils/tools.py

+ 7 - 9
FworkSpider/untils/tools.py

@@ -388,8 +388,8 @@ def extract_file_type(file_name="附件名", file_url="附件地址",file_type_l
         file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                       'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
         if file_type_list:
-            file_type_list = list(map(lambda x: x.lower(), file_type_list))
-            file_types.extend(file_type_list)
+            ftp_list = list(map(lambda x: x.lower(), file_type_list))
+            file_types.extend(ftp_list)
 
         file_type = file_url.split('?')[0].split('.')[-1].lower()
         if file_type not in file_types:
@@ -397,13 +397,11 @@ def extract_file_type(file_name="附件名", file_url="附件地址",file_type_l
             if file_type in file_types:
                 return file_type
             else:
-                file_type = file_name.split('?')[0].split('.')[-1].lower()
-                if file_type in file_types:
-                    return file_type
-                else:
-                    file_type = file_name.split('?')[-1].split('.')[-1].lower()
-                    if file_type in file_types:
-                        return file_type
+                for ftp in file_types:
+                    file_type = re.search(ftp, file_name) or re.search("\." + ftp, file_url)
+                    if file_type:
+                        return file_type.group(0).replace('.','')
+
         else:
             return file_type
     return None