12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #!/usr/bin/python3.6
- # -*- coding: utf-8 -*-
- # @Author : lijunliang
- # @Email : lijunliang@topnet.net.cn
- # @File : parse_file.py
- # @Software: PyCharm
- from util.convert2text import pdf_get_pages, word_get_text
- from util.file_operations import save_file
- from util.convert2pdf import lib2pdf, html2pdf
- from util.file_operations import file_copy
- from module.convertPdfServer import convert_pdf_start
- from module.convertTxtServer import convert_txt_start
- from loguru import logger
- import os
# docType = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'txt', 'html', 'htm']
def parse_file_start(file_path: str) -> (str, int, str):
    """
    Convert a document to PDF and extract its text.

    The file type is the text after the last "." in ``file_path``
    (lower-cased and stripped). The file is first turned into a PDF by the
    branch matching its type, then the PDF is parsed for text.

    :param file_path: path of the file to parse
    :return: tuple ``(text, pages, pdf_path)``. ``text`` has newlines
        replaced by spaces, carriage returns removed and surrounding
        whitespace stripped. ``pages`` is the first value returned by
        ``pdf_get_pages`` (presumably the page count; confirm against that
        helper). On a failed step the result is ``("", 0, pdf_path)``; for
        an unsupported or missing extension it is ``("", 0, "")``.
    """
    # rpartition (instead of split(".") plus [-2] indexing) so that a path
    # containing no "." is reported as unsupported rather than raising
    # IndexError.
    stem, dot, extension = file_path.rpartition(".")
    suffix = extension.lower().strip() if dot else ""
    # The PDF copy is written next to the source as "<stem>1.pdf" so that it
    # never overwrites an input that is itself a PDF.
    pdf_path = stem + "1.pdf" if dot else ""
    print(f"--->{suffix}")

    if suffix == "pdf":
        # Already a PDF: work on a copy.
        if not file_copy(file_path, pdf_path):
            return "", 0, pdf_path
    elif suffix in ("html", "htm"):
        if not html2pdf(file_path, pdf_path):
            return "", 0, pdf_path
    elif suffix in ("doc", "docx"):
        # Word documents go through the remote conversion service (the
        # original note says it runs on Windows for compatibility and is
        # slow); the returned payload is then saved as the PDF.
        stat, request_ret = convert_pdf_start(file_path, suffix)
        if not stat:
            return "", 0, pdf_path
        if not save_file(request_ret, pdf_path):
            return "", 0, pdf_path
    elif suffix in ("ppt", "pptx", "txt"):
        # Converted locally via LibreOffice; lib2pdf chooses the output path
        # itself, so pdf_path is replaced by its return value.
        pdf_path = lib2pdf(file_path)
    else:
        # Unsupported (or missing) extension.
        return "", 0, ""

    if not pdf_path:
        return "", 0, pdf_path

    # Parse the PDF.
    pages, textes = pdf_get_pages(pdf_path)
    if not pages:
        return "", 0, pdf_path

    stat, text = convert_txt_start(pdf_path)
    if not stat:
        # Fall back to the text extracted by pdf_get_pages.
        text = textes

    new_text = text.replace("\n", " ").replace("\r", "").strip()
    return new_text, pages, pdf_path
def get_property(file_path: str) -> (str, str, int):
    """
    Return the type, suffix and size of a file.

    :param file_path: path of the file to inspect
    :return: tuple ``(doctype, suffix, size)``. ``doctype`` is the text
        after the last "." of the file name, lower-cased and stripped;
        ``suffix`` is ``"." + doctype``; ``size`` is the value of
        ``os.path.getsize(file_path)`` in bytes. A file name without any
        "." yields ``("", "", size)``.
    :raises OSError: propagated from ``os.path.getsize`` (e.g. when the
        file does not exist).
    """
    # Look only at the final path component: a "." in a directory name
    # (or no "." at all) must not leak part of the path into doctype.
    file_name = os.path.basename(file_path)
    _, dot, extension = file_name.rpartition(".")
    doctype = extension.lower().strip() if dot else ""  # file type
    suffix = "." + doctype if dot else ""  # file suffix
    file_size = os.path.getsize(file_path)
    return doctype, suffix, file_size
if __name__ == '__main__':
    # Manual smoke test: parse a sample PDF and print the result tuple.
    # The unused `chardet` import was removed so the script no longer fails
    # when that package is not installed.
    ret = parse_file_start("../data/1234.pdf")
    print(ret)
|