ai
/
nsq_convert2txt


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
							import os
import pytest
import PyPDF2

# Absolute path of the directory that contains this test module.
TESTS_ROOT = os.path.dirname(os.path.abspath(__file__))
# Parent directory of the tests directory.
PROJECT_ROOT = os.path.split(TESTS_ROOT)[0]
# "Resources" directory under the project root; the parametrized PDF paths
# in this module are built from it.
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")


@pytest.mark.parametrize(
    "src",
    [
        (os.path.join(RESOURCE_ROOT, "crazyones.pdf")),
        (os.path.join(RESOURCE_ROOT, "commented.pdf")),
    ],
)
def test_get_annotations(src):
    """Walk every page of ``src`` and print its ``/Text`` annotations.

    For each page the test prints whether an ``/Annots`` entry is present,
    then prints the ``/Contents`` of every annotation whose ``/Subtype`` is
    ``/Text``, followed by an empty line. There are no assertions: the test
    passes as long as the traversal does not raise.

    Args:
        src: Path of the PDF file to read.
    """
    # Open the PDF in a context manager so the handle is closed even when
    # reading fails part-way through.
    with open(src, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)

        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            has_annots = "/Annots" in page
            print(has_annots)
            if not has_annots:
                continue
            for annot in page["/Annots"]:
                # Resolve the annotation once and reuse the result.
                annot_obj = annot.getObject()
                if annot_obj["/Subtype"] == "/Text":
                    print(annot_obj["/Contents"])
                    print("")


@pytest.mark.parametrize(
    "src",
    [
        (os.path.join(RESOURCE_ROOT, "attachment.pdf")),
        (os.path.join(RESOURCE_ROOT, "crazyones.pdf")),
    ],
)
def test_get_attachments(src):
    """Collect the file attachments referenced by annotations in ``src``.

    Every page is scanned for annotations whose ``/Subtype`` is
    ``/FileAttachment``. For each one, the file specification's ``/F`` value
    is used as the key and the data of its ``/EF`` -> ``/F`` stream as the
    value. There are no assertions: the test passes as long as the traversal
    does not raise.

    Args:
        src: Path of the PDF file to read.

    Returns:
        dict mapping each attachment's ``/F`` value to its stream data.
    """
    attachments = {}
    # Open the PDF in a context manager so the handle is closed even when
    # reading fails part-way through. All stream data is extracted inside
    # the block, so the returned dict does not depend on the open file.
    with open(src, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)

        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            if "/Annots" not in page:
                continue
            for annotation in page["/Annots"]:
                annotobj = annotation.getObject()
                if annotobj["/Subtype"] == "/FileAttachment":
                    fileobj = annotobj["/FS"]
                    attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData()
    # NOTE(review): pytest does not use a test's return value (recent
    # versions warn about it); kept so direct callers see the same result.
    return attachments


@pytest.mark.parametrize(
    "src,outline_elements",
    [
        (os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"), 9),
        (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0),
    ],
)
def test_get_outlines(src, outline_elements):
    """Check that ``getOutlines()`` yields the expected number of entries.

    Args:
        src: Path of the PDF file to read.
        outline_elements: Expected length of the list returned by
            ``PdfFileReader.getOutlines()`` for ``src``.
    """
    # Open the PDF in a context manager so the handle is closed even when
    # reading the outlines fails.
    with open(src, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        outlines = reader.getOutlines()
    assert len(outlines) == outline_elements


@pytest.mark.parametrize(
    "src,nb_images",
    [
        (os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"), 0),
        (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0),
        (os.path.join(RESOURCE_ROOT, "git.pdf"), 1),
    ],
)
def test_get_images(src, nb_images):
    """Extract the images on the first page of ``src`` and count them.

    Each ``/XObject`` resource on page 0 whose ``/Subtype`` is ``/Image`` is
    written to a file in the current working directory, named after the
    resource key without its leading character:

    * no ``/Filter`` or ``/FlateDecode``: rebuilt with PIL and saved as
      ``.png`` (an ``/SMask``, if present on a ``/FlateDecode`` image, is
      applied as the alpha channel);
    * ``/DCTDecode``, ``/JPXDecode``, ``/CCITTFaxDecode``: the stream data is
      written unchanged as ``.jpg``, ``.jp2`` or ``.tiff`` respectively;
    * any other filter: nothing is written and the image is not counted.

    Args:
        src: Path of the PDF file to read.
        nb_images: Expected number of image files written for page 0.
    """
    from PIL import Image

    # Filters whose stream data is dumped to disk unchanged, mapped to the
    # file extension used for the dump.
    raw_dump_extensions = {
        "/DCTDecode": ".jpg",
        "/JPXDecode": ".jp2",
        "/CCITTFaxDecode": ".tiff",
    }

    images_extracted = []

    # Open the PDF in a context manager so the handle is closed even when
    # extraction fails part-way through.
    with open(src, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page0 = reader.getPage(0)

        if "/XObject" in page0["/Resources"]:
            x_objects = page0["/Resources"]["/XObject"].getObject()

            for name in x_objects:
                candidate = x_objects[name]
                if candidate["/Subtype"] != "/Image":
                    continue

                size = (candidate["/Width"], candidate["/Height"])
                data = candidate.getData()
                # Anything that is not plain RGB is treated as palette data.
                mode = "RGB" if candidate["/ColorSpace"] == "/DeviceRGB" else "P"

                filename = None
                if "/Filter" not in candidate:
                    img = Image.frombytes(mode, size, data)
                    filename = name[1:] + ".png"
                    img.save(filename)
                else:
                    stream_filter = candidate["/Filter"]
                    if stream_filter == "/FlateDecode":
                        img = Image.frombytes(mode, size, data)
                        if "/SMask" in candidate:  # add alpha channel
                            alpha = Image.frombytes(
                                "L", size, candidate["/SMask"].getData()
                            )
                            img.putalpha(alpha)
                        filename = name[1:] + ".png"
                        img.save(filename)
                    # The isinstance guard keeps a non-string /Filter value
                    # (e.g. a list of filters) falling through silently
                    # instead of raising on the dict lookup.
                    elif (
                        isinstance(stream_filter, str)
                        and stream_filter in raw_dump_extensions
                    ):
                        filename = name[1:] + raw_dump_extensions[stream_filter]
                        with open(filename, "wb") as out_file:
                            out_file.write(data)

                if filename is not None:
                    images_extracted.append(filename)
        else:
            print("No image found.")

    assert len(images_extracted) == nb_images