convert2text.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #!/usr/bin/python3.6
  2. # -*- coding: utf-8 -*-
  3. # @Time : 2021/3/15 9:22
  4. # @Author : lijunliang
  5. # @Email : lijunliang@topnet.net.cn
  6. # @File : convert2text.py
  7. # @Software: PyCharm
  8. import pdfplumber
  9. # from loguru import logger
  10. # import docx
  11. from pdfminer.pdfparser import PDFParser
  12. from pdfminer.pdfdocument import PDFDocument
  13. from pdfminer.pdfpage import PDFPage
  14. from pdfminer.pdfdevice import PDFDevice
  15. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  16. from pdfminer.converter import PDFPageAggregator
  17. from pdfminer.layout import LTTextBoxHorizontal, LAParams
  18. from pdfminer.pdfpage import PDFTextExtractionNotAllowed
  19. def pdf_get_pages(pdf_path: str) -> (int, str):
  20. fp = open(pdf_path, 'rb')
  21. parser = PDFParser(fp)
  22. document = PDFDocument(parser)
  23. result = ""
  24. count = 0
  25. if not document.is_extractable:
  26. raise PDFTextExtractionNotAllowed
  27. else:
  28. rsrcmgr = PDFResourceManager()
  29. laparams = LAParams()
  30. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  31. interpreter = PDFPageInterpreter(rsrcmgr, device)
  32. for page in PDFPage.create_pages(document):
  33. interpreter.process_page(page)
  34. layout = device.get_result()
  35. for x in layout:
  36. if (isinstance(x, LTTextBoxHorizontal)):
  37. result += x.get_text()
  38. count += 1
  39. fp.close()
  40. return count, result
  41. def pdf_get_pages1(pdf_path: str) -> int:
  42. """
  43. 解析pdf文件,但不保留表格样式
  44. :param pdf_path: pdf文件
  45. :return:
  46. """
  47. try:
  48. pdf = pdfplumber.open(pdf_path)
  49. # 提取pdf文本内容
  50. pages = pdf.pages
  51. pageCount = len(pages)
  52. return pageCount
  53. except Exception as e:
  54. logger.warning("pdf获取页数失败--->%s" % e)
  55. return 0
  56. def word_get_text(word_path: str) -> str:
  57. '''
  58. 获取docX word文档内容
  59. :param word_path:
  60. :return:
  61. '''
  62. # 获取文档对象
  63. try:
  64. file = docx.Document(word_path)
  65. content = ""
  66. for para in file.paragraphs:
  67. content += para.text
  68. except Exception as e:
  69. logger.warning("获取docx文本失败--->%s" % e)
  70. return ""
  71. return content
  72. if __name__ == '__main__':
  73. # pdf_path = ""
  74. pdf_path = "../data/1234.pdf"
  75. pdf_data = pdf_get_pages(pdf_path)
  76. print(pdf_data)