#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Author  : lijunliang
# @Email   : lijunliang@topnet.net.cn
# @File    : parse_file.py
# @Software: PyCharm
from util.convert2text import pdf_get_pages, word_get_text
from util.file_operations import save_file
from util.convert2pdf import lib2pdf, html2pdf
from util.file_operations import file_copy
from module.convertPdfServer import convert_pdf_start
from module.convertTxtServer import convert_txt_start
from loguru import logger
import os


# docType = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'txt', 'html', 'htm']


def parse_file_start(file_path: str) -> (str, int, str):
    """
    Convert a document to PDF and extract its text.

    The (case-insensitive) extension of ``file_path`` selects the conversion:

    * ``pdf``                 -> copied with ``file_copy``
    * ``html`` / ``htm``      -> ``html2pdf``
    * ``doc`` / ``docx``      -> ``convert_pdf_start`` then ``save_file``
    * ``ppt`` / ``pptx`` / ``txt`` -> ``lib2pdf`` (which returns its own
      output path)

    Except for the ``lib2pdf`` branch, the PDF is written next to the source
    file with ``1`` appended to the file stem (``a/b.docx`` -> ``a/b1.pdf``).

    :param file_path: path of the document to parse
    :return: ``(text, pages, pdf_path)``. ``text`` has newlines replaced by
        spaces, carriage returns removed and surrounding whitespace stripped;
        ``pages`` is the first value returned by ``pdf_get_pages``
        (presumably the page count). On any failure ``text`` is ``""`` and
        ``pages`` is ``0``; for a missing or unsupported extension
        ``pdf_path`` is ``""`` as well.
    """
    # os.path.splitext only inspects the last path component, so a path
    # without an extension (or with a dot in a directory name) no longer
    # raises IndexError or yields a bogus suffix.
    root, ext = os.path.splitext(file_path)
    suffix = ext[1:].lower().strip()
    # Target PDF sits beside the source: "<stem>1.pdf".
    pdf_path = root + "1.pdf"
    print(f"--->{suffix}")

    if suffix == "pdf":
        # Already a PDF: just copy it to the target path.
        if not file_copy(file_path, pdf_path):  # copy failed
            return "", 0, pdf_path

    elif suffix in ("html", "htm"):
        if not html2pdf(file_path, pdf_path):
            return "", 0, pdf_path

    elif suffix in ("doc", "docx"):
        # Original author's note: Word formats go through the Windows
        # conversion service for compatibility reasons, which is slow.
        stat, request_ret = convert_pdf_start(file_path, suffix)
        if not stat:
            return "", 0, pdf_path
        if not save_file(request_ret, pdf_path):  # storing the response failed
            return "", 0, pdf_path

    elif suffix in ("ppt", "pptx", "txt"):
        # Original author's note: converted via LibreOffice on Linux (fast).
        # lib2pdf chooses the output path itself, replacing the default one.
        pdf_path = lib2pdf(file_path)

    else:
        # Missing or unsupported extension: nothing was converted.
        return "", 0, ""

    if not pdf_path:
        return "", 0, pdf_path

    # Extract the text from the generated PDF.
    pages, fallback_text = pdf_get_pages(pdf_path)
    if not pages:
        return "", 0, pdf_path
    stat, text = convert_txt_start(pdf_path)
    if not stat:
        # Fall back to the text obtained by pdf_get_pages.
        text = fallback_text
    new_text = text.replace("\n", " ").replace("\r", "").strip()
    return new_text, pages, pdf_path


def get_property(file_path: str) -> (str, str, int):
    """
    Return basic properties of a file.

    The extension is taken from the last path component only, so dots in
    directory names are ignored and a file without an extension yields an
    empty type and suffix instead of a fragment of its path.

    :param file_path: path of the file to inspect
    :return: ``(doctype, suffix, size)`` where ``doctype`` is the lower-cased
        extension without the dot (``""`` if there is none), ``suffix`` is
        the same extension with a leading dot (``""`` if there is none) and
        ``size`` is the file size in bytes
    :raises OSError: if the file does not exist or is inaccessible
        (raised by ``os.path.getsize``)
    """
    doctype = os.path.splitext(file_path)[1][1:].lower().strip()  # file type
    suffix = "." + doctype if doctype else ""  # file suffix, with the dot
    file_size = os.path.getsize(file_path)  # size in bytes
    return doctype, suffix, file_size


if __name__ == '__main__':
    # Manual smoke test: parse a sample PDF and print the
    # (text, pages, pdf_path) tuple returned by parse_file_start.
    ret = parse_file_start("../data/1234.pdf")
    print(ret)