# test_workflows.py
  1. import os
  2. import binascii
  3. import sys
  4. import pytest
  5. from PyPDF2 import PdfFileReader, PdfFileWriter
  6. TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
  7. PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
  8. RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
  9. sys.path.append(PROJECT_ROOT)
  10. def test_PdfReaderFileLoad():
  11. """
  12. Test loading and parsing of a file. Extract text of the file and compare to expected
  13. textual output. Expected outcome: file loads, text matches expected.
  14. """
  15. with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
  16. # Load PDF file from file
  17. ipdf = PdfFileReader(inputfile)
  18. ipdf_p1 = ipdf.getPage(0)
  19. # Retrieve the text of the PDF
  20. with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
  21. pdftext = pdftext_file.read()
  22. ipdf_p1_text = ipdf_p1.extractText().replace("\n", "").encode("utf-8")
  23. # Compare the text of the PDF to a known source
  24. assert ipdf_p1_text == pdftext, (
  25. "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
  26. % (pdftext, ipdf_p1_text)
  27. )
  28. def test_PdfReaderJpegImage():
  29. """
  30. Test loading and parsing of a file. Extract the image of the file and compare to expected
  31. textual output. Expected outcome: file loads, image matches expected.
  32. """
  33. with open(os.path.join(RESOURCE_ROOT, "jpeg.pdf"), "rb") as inputfile:
  34. # Load PDF file from file
  35. ipdf = PdfFileReader(inputfile)
  36. # Retrieve the text of the image
  37. with open(os.path.join(RESOURCE_ROOT, "jpeg.txt"), "r") as pdftext_file:
  38. imagetext = pdftext_file.read()
  39. ipdf_p0 = ipdf.getPage(0)
  40. xObject = ipdf_p0["/Resources"]["/XObject"].getObject()
  41. data = xObject["/Im4"].getData()
  42. # Compare the text of the PDF to a known source
  43. assert binascii.hexlify(data).decode() == imagetext, (
  44. "PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
  45. % (imagetext, binascii.hexlify(data).decode())
  46. )
  47. def test_read_metadata():
  48. with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
  49. ipdf = PdfFileReader(inputfile)
  50. metadict = ipdf.getDocumentInfo()
  51. assert metadict.title is None
  52. assert dict(metadict) == {
  53. "/CreationDate": "D:20150604133406-06'00'",
  54. "/Creator": " XeTeX output 2015.06.04:1334",
  55. "/Producer": "xdvipdfmx (20140317)",
  56. }
  57. def test_decrypt():
  58. with open(
  59. os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), "rb"
  60. ) as inputfile:
  61. ipdf = PdfFileReader(inputfile)
  62. assert ipdf.isEncrypted == True
  63. ipdf.decrypt("openpassword")
  64. assert ipdf.getNumPages() == 1
  65. assert ipdf.isEncrypted == True
  66. metadict = ipdf.getDocumentInfo()
  67. assert dict(metadict) == {
  68. "/CreationDate": "D:20220403203552+02'00'",
  69. "/Creator": "Writer",
  70. "/Producer": "LibreOffice 6.4",
  71. }
  72. # Is extractText() broken for encrypted files?
  73. # assert ipdf.getPage(0).extractText().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n"
  74. @pytest.mark.parametrize("degree", [0, 90, 180, 270, 360, -90])
  75. def test_rotate(degree):
  76. with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
  77. ipdf = PdfFileReader(inputfile)
  78. page = ipdf.getPage(0)
  79. page.rotateCounterClockwise(degree)
  80. def test_rotate_45():
  81. with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
  82. ipdf = PdfFileReader(inputfile)
  83. page = ipdf.getPage(0)
  84. with pytest.raises(AssertionError):
  85. page.rotateCounterClockwise(45)