ai
/
nsq_convert2txt


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
							import os
import binascii
import sys
import pytest

from PyPDF2 import PdfFileReader, PdfFileWriter

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")

sys.path.append(PROJECT_ROOT)


def test_PdfReaderFileLoad():
    """
    Test loading and parsing of a file. Extract text of the file and compare to expected
    textual output. Expected outcome: file loads, text matches expected.
    """

    with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
        # Load PDF file from file
        ipdf = PdfFileReader(inputfile)
        ipdf_p1 = ipdf.getPage(0)

        # Retrieve the text of the PDF
        with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
            pdftext = pdftext_file.read()

        ipdf_p1_text = ipdf_p1.extractText().replace("\n", "").encode("utf-8")

        # Compare the text of the PDF to a known source
        assert ipdf_p1_text == pdftext, (
            "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
            % (pdftext, ipdf_p1_text)
        )


def test_PdfReaderJpegImage():
    """
    Test loading and parsing of a file. Extract the image of the file and compare to expected
    textual output. Expected outcome: file loads, image matches expected.
    """

    with open(os.path.join(RESOURCE_ROOT, "jpeg.pdf"), "rb") as inputfile:
        # Load PDF file from file
        ipdf = PdfFileReader(inputfile)

        # Retrieve the text of the image
        with open(os.path.join(RESOURCE_ROOT, "jpeg.txt"), "r") as pdftext_file:
            imagetext = pdftext_file.read()

        ipdf_p0 = ipdf.getPage(0)
        xObject = ipdf_p0["/Resources"]["/XObject"].getObject()
        data = xObject["/Im4"].getData()

        # Compare the text of the PDF to a known source
        assert binascii.hexlify(data).decode() == imagetext, (
            "PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
            % (imagetext, binascii.hexlify(data).decode())
        )


def test_read_metadata():
    with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
        ipdf = PdfFileReader(inputfile)
        metadict = ipdf.getDocumentInfo()
        assert metadict.title is None
        assert dict(metadict) == {
            "/CreationDate": "D:20150604133406-06'00'",
            "/Creator": " XeTeX output 2015.06.04:1334",
            "/Producer": "xdvipdfmx (20140317)",
        }


def test_decrypt():
    with open(
        os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), "rb"
    ) as inputfile:
        ipdf = PdfFileReader(inputfile)
        assert ipdf.isEncrypted == True
        ipdf.decrypt("openpassword")
        assert ipdf.getNumPages() == 1
        assert ipdf.isEncrypted == True
        metadict = ipdf.getDocumentInfo()
        assert dict(metadict) == {
            "/CreationDate": "D:20220403203552+02'00'",
            "/Creator": "Writer",
            "/Producer": "LibreOffice 6.4",
        }
        # Is extractText() broken for encrypted files?
        # assert ipdf.getPage(0).extractText().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n"


@pytest.mark.parametrize("degree", [0, 90, 180, 270, 360, -90])
def test_rotate(degree):
    with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
        ipdf = PdfFileReader(inputfile)
        page = ipdf.getPage(0)
        page.rotateCounterClockwise(degree)


def test_rotate_45():
    with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
        ipdf = PdfFileReader(inputfile)
        page = ipdf.getPage(0)
        with pytest.raises(AssertionError):
            page.rotateCounterClockwise(45)