#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Author  : lijunliang
# @Email   : lijunliang@topnet.net.cn
# @File    : parse_file.py
# @Software: PyCharm
"""Convert supported documents to PDF and extract their text."""
import os
from typing import Tuple

from loguru import logger

from util.convert2text import pdf_get_pages, word_get_text
from util.file_operations import save_file
from util.convert2pdf import lib2pdf, html2pdf
from util.file_operations import file_copy
from module.convertPdfServer import convert_pdf_start
from module.convertTxtServer import convert_txt_start


# docType = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', "txt", html, htm]
def parse_file_start(file_path: str) -> Tuple[str, int, str]:
    """Convert a document to PDF and extract its text.

    The file type is taken from the text after the last "." in
    ``file_path``. Handled types are pdf, html/htm, doc/docx and
    ppt/pptx/txt; anything else (including a path without any ".") is
    rejected.

    :param file_path: path of the document to parse
    :return: ``(text, pages, pdf_path)``.  ``text`` has newlines replaced
        by spaces, carriage returns removed and surrounding whitespace
        stripped.  ``pages`` is the first value returned by
        ``pdf_get_pages`` (presumably the page count -- confirm against
        that helper).  On a conversion or parsing failure the result is
        ``("", 0, pdf_path)``; for an unsupported type it is
        ``("", 0, "")``.
    """
    # Split on the last "." only; the previous split(".")/index approach
    # raised IndexError for a path that contains no "." at all.
    root, dot, suffix = file_path.rpartition(".")
    if not dot:
        # No extension: report it the same way as any unsupported type.
        return "", 0, ""
    suffix = suffix.lower().strip()

    # The PDF is written next to the source as "<name>1.pdf" so that a
    # source file that is already a PDF is never overwritten.
    pdf_path = f"{root}1.pdf"
    logger.debug("--->{}", suffix)

    if suffix == "pdf":
        # Already a PDF: just copy it to the working path.
        if not file_copy(file_path, pdf_path):
            return "", 0, pdf_path  # copy failed
    elif suffix in ("html", "htm"):
        if not html2pdf(file_path, pdf_path):
            return "", 0, pdf_path
    elif suffix in ("doc", "docx"):
        # Converted through the Windows service because of compatibility
        # issues (slow). The original note also mentioned xls/xlsx, but
        # those suffixes are not handled by this branch.
        stat, request_ret = convert_pdf_start(file_path, suffix)
        if not stat:
            return "", 0, pdf_path
        if not save_file(request_ret, pdf_path):
            return "", 0, pdf_path  # failed to store the received PDF
    elif suffix in ("ppt", "pptx", "txt"):
        # Converted by the Linux service (LibreOffice), which is fast.
        # lib2pdf chooses the output path itself and returns it.
        pdf_path = lib2pdf(file_path)
        if not pdf_path:
            return "", 0, pdf_path
    else:
        return "", 0, ""

    # Parse the PDF.
    pages, textes = pdf_get_pages(pdf_path)
    if not pages:
        return "", 0, pdf_path

    # Prefer the text service; fall back to the text returned by
    # pdf_get_pages when the service reports failure.
    stat, text = convert_txt_start(pdf_path)
    if not stat:
        text = textes

    new_text = text.replace("\n", " ").replace("\r", "").strip()
    return new_text, pages, pdf_path


def get_property(file_path: str) -> Tuple[str, str, int]:
    """Return basic properties of a file.

    The extension is read from the file name only, so a "." inside a
    directory name is never mistaken for the start of an extension.

    :param file_path: path of the file to inspect
    :return: ``(doctype, suffix, size)`` where ``doctype`` is the
        lower-cased extension without the dot, ``suffix`` is
        ``"." + doctype``, and ``size`` is the file size in bytes.  Both
        ``doctype`` and ``suffix`` are empty strings when the file name
        has no ".".
    :raises OSError: if ``os.path.getsize`` cannot access the file
    """
    file_name = os.path.basename(file_path)
    _, dot, extension = file_name.rpartition(".")
    doctype = extension.lower().strip() if dot else ""  # file type
    suffix = "." + doctype if dot else ""  # file suffix
    file_size = os.path.getsize(file_path)
    return doctype, suffix, file_size


if __name__ == '__main__':
    import chardet

    ret = parse_file_start("../data/1234.pdf")
    # print(chardet.detect(ret[0])['encoding'])
    print(ret)