#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2021/3/15 9:22
# @Author : lijunliang
# @Email : lijunliang@topnet.net.cn
# @File : convert2text.py
# @Software: PyCharm
"""Helpers for extracting text and page counts from PDF and DOCX files."""
import logging
from typing import Tuple

import pdfplumber
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed

# The loguru import this module once relied on was commented out, which left
# ``logger`` undefined and turned every logging call into a NameError.
# The standard-library logger keeps the ``logger.warning(...)`` calls working.
logger = logging.getLogger(__name__)


def pdf_get_pages(pdf_path: str) -> Tuple[int, str]:
    """Count the pages of a PDF and collect its horizontal text boxes.

    :param pdf_path: path of the PDF file to read
    :return: ``(page_count, text)`` where ``text`` is the concatenation of
        ``get_text()`` for every ``LTTextBoxHorizontal`` found, in page order
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable
    """
    # The context manager closes the file even when parsing raises or the
    # document refuses extraction (the manual close() leaked the handle).
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                "Text extraction is not allowed: %r" % pdf_path)

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        chunks = []
        count = 0
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Only horizontal text boxes contribute to the result; every
            # other layout element on the page is skipped.
            chunks.extend(
                item.get_text()
                for item in device.get_result()
                if isinstance(item, LTTextBoxHorizontal)
            )
            count += 1

    return count, "".join(chunks)


def pdf_get_pages1(pdf_path: str) -> int:
    """Return the number of pages in a PDF, using pdfplumber.

    :param pdf_path: path of the PDF file to read
    :return: the page count, or ``0`` if the file could not be opened or
        parsed (the failure is logged as a warning)
    """
    try:
        # Use the PDF object as a context manager so it is closed after the
        # pages have been counted.
        with pdfplumber.open(pdf_path) as pdf:
            return len(pdf.pages)
    except Exception as e:  # deliberate best-effort: log and fall back to 0
        logger.warning("pdf获取页数失败--->%s", e)
        return 0


def word_get_text(word_path: str) -> str:
    """Return the text of all paragraphs of a .docx document.

    Paragraph texts are concatenated without any separator.

    :param word_path: path of the .docx file to read
    :return: the concatenated paragraph text, or ``""`` if the document could
        not be read or python-docx is unavailable (the failure is logged as a
        warning)
    """
    try:
        # Imported lazily: the module-level ``import docx`` was commented
        # out, so the name was never defined when this function ran.
        import docx

        document = docx.Document(word_path)
        return "".join(para.text for para in document.paragraphs)
    except Exception as e:  # deliberate best-effort: log and fall back to ""
        logger.warning("获取docx文本失败--->%s", e)
        return ""


if __name__ == '__main__':
    pdf_path = "../data/1234.pdf"
    pdf_data = pdf_get_pages(pdf_path)
    print(pdf_data)