# test_reader.py - tests for reading annotations, attachments, outlines and images with PyPDF2.PdfFileReader.
import os
import tempfile

import pytest
import PyPDF2
  4. TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
  5. PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
  6. RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
  7. @pytest.mark.parametrize(
  8. "src",
  9. [
  10. (os.path.join(RESOURCE_ROOT, "crazyones.pdf")),
  11. (os.path.join(RESOURCE_ROOT, "commented.pdf")),
  12. ],
  13. )
  14. def test_get_annotations(src):
  15. reader = PyPDF2.PdfFileReader(open(src, "rb"))
  16. for i in range(reader.getNumPages()):
  17. page = reader.getPage(i)
  18. print("/Annots" in page)
  19. if "/Annots" in page:
  20. for annot in page["/Annots"]:
  21. subtype = annot.getObject()["/Subtype"]
  22. if subtype == "/Text":
  23. print(annot.getObject()["/Contents"])
  24. print("")
  25. @pytest.mark.parametrize(
  26. "src",
  27. [
  28. (os.path.join(RESOURCE_ROOT, "attachment.pdf")),
  29. (os.path.join(RESOURCE_ROOT, "crazyones.pdf")),
  30. ],
  31. )
  32. def test_get_attachments(src):
  33. reader = PyPDF2.PdfFileReader(open(src, "rb"))
  34. attachments = {}
  35. for i in range(reader.getNumPages()):
  36. page = reader.getPage(i)
  37. if "/Annots" in page:
  38. for annotation in page["/Annots"]:
  39. annotobj = annotation.getObject()
  40. if annotobj["/Subtype"] == "/FileAttachment":
  41. fileobj = annotobj["/FS"]
  42. attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData()
  43. return attachments
  44. @pytest.mark.parametrize(
  45. "src,outline_elements",
  46. [
  47. (os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"), 9),
  48. (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0),
  49. ],
  50. )
  51. def test_get_outlines(src, outline_elements):
  52. reader = PyPDF2.PdfFileReader(open(src, "rb"))
  53. outlines = reader.getOutlines()
  54. assert len(outlines) == outline_elements
  55. @pytest.mark.parametrize(
  56. "src,nb_images",
  57. [
  58. (os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"), 0),
  59. (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0),
  60. (os.path.join(RESOURCE_ROOT, "git.pdf"), 1),
  61. ],
  62. )
  63. def test_get_images(src, nb_images):
  64. from PIL import Image
  65. input1 = PyPDF2.PdfFileReader(open(src, "rb"))
  66. page0 = input1.getPage(0)
  67. images_extracted = []
  68. if "/XObject" in page0["/Resources"]:
  69. xObject = page0["/Resources"]["/XObject"].getObject()
  70. for obj in xObject:
  71. if xObject[obj]["/Subtype"] == "/Image":
  72. size = (xObject[obj]["/Width"], xObject[obj]["/Height"])
  73. data = xObject[obj].getData()
  74. if xObject[obj]["/ColorSpace"] == "/DeviceRGB":
  75. mode = "RGB"
  76. else:
  77. mode = "P"
  78. filename = None
  79. if "/Filter" in xObject[obj]:
  80. if xObject[obj]["/Filter"] == "/FlateDecode":
  81. img = Image.frombytes(mode, size, data)
  82. if "/SMask" in xObject[obj]: # add alpha channel
  83. alpha = Image.frombytes(
  84. "L", size, xObject[obj]["/SMask"].getData()
  85. )
  86. img.putalpha(alpha)
  87. filename = obj[1:] + ".png"
  88. img.save(filename)
  89. elif xObject[obj]["/Filter"] == "/DCTDecode":
  90. filename = obj[1:] + ".jpg"
  91. img = open(filename, "wb")
  92. img.write(data)
  93. img.close()
  94. elif xObject[obj]["/Filter"] == "/JPXDecode":
  95. filename = obj[1:] + ".jp2"
  96. img = open(filename, "wb")
  97. img.write(data)
  98. img.close()
  99. elif xObject[obj]["/Filter"] == "/CCITTFaxDecode":
  100. filename = obj[1:] + ".tiff"
  101. img = open(filename, "wb")
  102. img.write(data)
  103. img.close()
  104. else:
  105. img = Image.frombytes(mode, size, data)
  106. filename = obj[1:] + ".png"
  107. img.save(filename)
  108. if filename is not None:
  109. images_extracted.append(filename)
  110. else:
  111. print("No image found.")
  112. assert len(images_extracted) == nb_images