1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586 |
- #!/usr/bin/python3.6
- # -*- coding: utf-8 -*-
- # @Time : 2021/3/15 9:22
- # @Author : lijunliang
- # @Email : lijunliang@topnet.net.cn
- # @File : convert2text.py
- # @Software: PyCharm
- import pdfplumber
- # from loguru import logger
- # import docx
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfdevice import PDFDevice
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import PDFPageAggregator
- from pdfminer.layout import LTTextBoxHorizontal, LAParams
- from pdfminer.pdfpage import PDFTextExtractionNotAllowed
def pdf_get_pages(pdf_path: str) -> (int, str):
    """Extract the text of a PDF with pdfminer and count its pages.

    :param pdf_path: path of the PDF file to read.
    :return: a ``(count, text)`` tuple, where ``count`` is the number of
        pages processed and ``text`` is the concatenated text of every
        ``LTTextBoxHorizontal`` layout element, in page order.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # NOTE(review): the counter is incremented once per page, matching the
    # function name - confirm this is the intended meaning of ``count``.
    count = 0
    chunks = []
    # The context manager closes the file even when parsing raises, e.g. on
    # a malformed PDF or when extraction is not allowed.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Only horizontal text boxes contribute to the output; every
            # other layout element on the page is ignored.
            chunks.extend(
                item.get_text()
                for item in device.get_result()
                if isinstance(item, LTTextBoxHorizontal)
            )
            count += 1
    # Join once at the end instead of growing a string piece by piece.
    return count, "".join(chunks)
def pdf_get_pages1(pdf_path: str) -> int:
    """Return the number of pages in a PDF, as reported by pdfplumber.

    This is a best-effort helper: any failure is logged as a warning and
    reported to the caller as a page count of 0 rather than as an exception.

    :param pdf_path: path of the PDF file.
    :return: the page count, or 0 if the file could not be opened or parsed.
    """
    # Imported locally so the handler below always has a working logger.
    import logging

    try:
        # The context manager releases the underlying file handle once the
        # pages have been counted.
        with pdfplumber.open(pdf_path) as pdf:
            return len(pdf.pages)
    except Exception as e:
        logging.getLogger(__name__).warning("pdf获取页数失败--->%s", e)
        return 0
def word_get_text(word_path: str) -> str:
    """Return the text of a .docx document.

    The text of every paragraph is concatenated in document order with no
    separator between paragraphs. This is a best-effort helper: any failure
    is logged as a warning and reported to the caller as an empty string.

    :param word_path: path of the .docx file.
    :return: the concatenated paragraph text, or "" on failure.
    """
    # Imported locally so the handler below always has a working logger.
    import logging

    try:
        # Imported lazily so that a missing python-docx installation is
        # reported through the same failure path as an unreadable document,
        # instead of as an undefined name.
        import docx

        document = docx.Document(word_path)
        return "".join(para.text for para in document.paragraphs)
    except Exception as e:
        logging.getLogger(__name__).warning("获取docx文本失败--->%s", e)
        return ""
if __name__ == '__main__':
    # Manual check: extract a sample PDF and print the (count, text) result.
    sample_pdf = "../data/1234.pdf"
    print(pdf_get_pages(sample_pdf))
|