12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070 |
- # -*- coding: utf-8 -*-
- #
- # vim: sw=4:expandtab:foldmethod=marker
- #
- # Copyright (c) 2006, Mathieu Fenniak
- # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
- #
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # * Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- # * The name of the author may not be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- # POSSIBILITY OF SUCH DAMAGE.
- """
- A pure-Python PDF library with an increasing number of capabilities.
- See README for links to FAQ, documentation, homepage, etc.
- """
# Package authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
__maintainer__ = "Phaseit, Inc."
__maintainer_email__ = "PyPDF2@phaseit.net"
# The attribute used to be spelled without its trailing underscores; the old
# spelling is kept as an alias so existing references keep resolving.
__maintainer_email = __maintainer_email__
- import math
- import struct
- import sys
- from sys import version_info
- if version_info < ( 3, 0 ):
- from cStringIO import StringIO
- else:
- from io import StringIO
- if version_info < ( 3, 0 ):
- BytesIO = StringIO
- else:
- from io import BytesIO
- from .generic import *
- from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
- from .utils import isString, b_, u_, ord_, str_, formatWarning
- if version_info < ( 2, 4 ):
- from sets import ImmutableSet as frozenset
- if version_info < ( 2, 5 ):
- from md5 import md5
- else:
- from hashlib import md5
- import uuid
- class PdfFileWriter(object):
- """
- This class supports writing PDF files out, given pages produced by another
- class (typically :class:`PdfFileReader<PdfFileReader>`).
- """
def __init__(self):
    """
    Set up an empty document: a ``%PDF-1.3`` header, an empty page tree,
    an information dictionary naming the producer, and a catalog that is
    kept aside (``_root_object``) without being registered yet.
    """
    self._header = b_("%PDF-1.3")
    # Every indirect object of the output, stored in object-number order.
    self._objects = []

    # Page tree root; registered first, so it becomes object number 1.
    page_tree = DictionaryObject()
    page_tree[NameObject("/Type")] = NameObject("/Pages")
    page_tree[NameObject("/Count")] = NumberObject(0)
    page_tree[NameObject("/Kids")] = ArrayObject()
    self._pages = self._addObject(page_tree)

    # Information dictionary; the producer is a BOM-prefixed UTF-16BE string.
    producer = codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')
    info = DictionaryObject()
    info[NameObject("/Producer")] = createStringObject(producer)
    self._info = self._addObject(info)

    # Catalog; ``_root`` stays None until the catalog is registered.
    catalog = DictionaryObject()
    catalog[NameObject("/Type")] = NameObject("/Catalog")
    catalog[NameObject("/Pages")] = self._pages
    self._root_object = catalog
    self._root = None
def _addObject(self, obj):
    """
    Append *obj* to the object list and return an indirect reference to it.

    The reference's number is the object's 1-based position in
    ``self._objects``; its generation is always 0.
    """
    registry = self._objects
    registry.append(obj)
    return IndirectObject(len(registry), 0, self)
def getObject(self, ido):
    """
    Resolve an indirect reference that belongs to this writer.

    :param ido: reference exposing ``pdf`` and ``idnum`` attributes.
    :return: the object stored at position ``ido.idnum - 1``.
    :raises ValueError: if ``ido.pdf`` is not this writer.
    """
    owner = ido.pdf
    if owner != self:
        raise ValueError("pdf must be self")
    # Object numbers are 1-based, list positions are 0-based.
    slot = ido.idnum - 1
    return self._objects[slot]
def _addPage(self, page, action):
    """
    Register *page* and hook it into the page tree.

    :param page: dictionary whose ``/Type`` entry equals ``/Page``.
    :param action: callable ``action(kids, page_ref)`` that places the new
        page reference into the ``/Kids`` array.
    """
    assert page["/Type"] == "/Page"
    # Point the page at our tree root before it is registered.
    page[NameObject("/Parent")] = self._pages
    page_ref = self._addObject(page)

    tree = self.getObject(self._pages)
    action(tree["/Kids"], page_ref)
    new_total = tree["/Count"] + 1
    tree[NameObject("/Count")] = NumberObject(new_total)
def addPage(self, page):
    """
    Append a page at the end of this PDF file. Pages usually come from a
    :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to append; expected to be an
        instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    """
    append_to_kids = list.append
    self._addPage(page, append_to_kids)
def insertPage(self, page, index=0):
    """
    Put a page at a given position in this PDF file. Pages usually come
    from a :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to insert; expected to be an
        instance of :class:`PageObject<pdf.PageObject>`.
    :param int index: Position at which the page will be inserted.
    """
    def place(kids, page_ref):
        # Same contract as list.insert: negative and oversized positions
        # are handled by the list itself.
        return kids.insert(index, page_ref)

    self._addPage(page, place)
def getPage(self, pageNumber):
    """
    Look up a page of this PDF file by its position.

    :param int pageNumber: Position of the wanted page
        (the first page is number zero)
    :return: the page found at *pageNumber*
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    # XXX: crude hack -- only the root node's direct /Kids are consulted.
    kids = self.getObject(self._pages)["/Kids"]
    return kids[pageNumber].getObject()
def getNumPages(self):
    """
    :return: the page count recorded in the page tree root.
    :rtype: int
    """
    count = self.getObject(self._pages)[NameObject("/Count")]
    return int(count)
def addBlankPage(self, width=None, height=None):
    """
    Create a blank page, append it to this PDF file and return it. The
    page is built by ``PageObject.createBlankPage``, which receives this
    writer together with the requested size.

    :param float width: Page width in default user space units.
    :param float height: Page height in default user space units.
    :return: the newly appended page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Inserts a blank page to this PDF file and returns it. If a dimension
    is missing and a page already exists at *index*, both dimensions are
    taken from that page's media box; otherwise the size handling is left
    to ``PageObject.createBlankPage``.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    size_missing = width is None or height is None
    # The bounds check has to guard BOTH missing-dimension cases. Written
    # without parentheses, ``A or B and C`` groups as ``A or (B and C)``,
    # which looked up the page at *index* even when no such page existed.
    if size_missing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
def addJS(self, javascript):
    """
    Attach Javascript that is launched when this PDF is opened.

    :param str javascript: Your Javascript.

    >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.
    """
    # The action dictionary; the script is emitted wrapped in parentheses.
    action = DictionaryObject()
    action[NameObject("/Type")] = NameObject("/Action")
    action[NameObject("/S")] = NameObject("/JavaScript")
    action[NameObject("/JS")] = NameObject("(%s)" % javascript)
    action_ref = self._addObject(action)

    # The name tree entry needs some name; any value works, so use a UUID.
    entry_name = str(uuid.uuid4())
    names_array = ArrayObject([createStringObject(entry_name), action_ref])
    name_tree = DictionaryObject({
        NameObject("/JavaScript"): DictionaryObject({
            NameObject("/Names"): names_array,
        }),
    })
    self._addObject(name_tree)

    # Both catalog entries are replaced outright.
    self._root_object.update({
        NameObject("/OpenAction"): action_ref,
        NameObject("/Names"): name_tree
    })
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # Three pieces are built: the stream with the file's data, the
    # /Filespec dictionary, and the name entry that goes in the catalog.

    # 1) The embedded file stream. Sample:
    #      8 0 obj
    #      <<
    #      /Length 12
    #      /Type /EmbeddedFile
    #      >>
    #      stream
    #      Hello world!
    #      endstream
    #      endobj
    embedded = DecodedStreamObject()
    embedded.setData(fdata)
    embedded.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # 2) The Filespec dictionary. Sample:
    #      7 0 obj
    #      <<
    #      /Type /Filespec
    #      /F (hello.txt)
    #      /EF << /F 8 0 R >>
    #      >>
    ef_dict = DictionaryObject({NameObject("/F"): embedded})
    filespec = DictionaryObject({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): ef_dict,
    })

    # 3) The catalog entry referring to the Filespec. Sample:
    #      1 0 obj
    #      <<
    #      /Type /Catalog
    #      /Outlines 2 0 R
    #      /Pages 3 0 R
    #      /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #      >>
    #      endobj
    names_array = ArrayObject([createStringObject(fname), filespec])
    name_tree = DictionaryObject({
        NameObject("/EmbeddedFiles"): DictionaryObject({
            NameObject("/Names"): names_array,
        }),
    })

    # The catalog's /Names entry is replaced outright.
    self._root_object.update({NameObject("/Names"): name_tree})
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Append every page of *reader* to this writer, optionally reporting
    each appended page to a callback.

    :param reader: a PdfFileReader object whose pages are copied, in
        order, to this writer object.
    :callback after_page_append (function): Invoked after each page is
        appended, when a callable is supplied. Callback signature:

        :param writer_pageref (PDF page reference): the page as fetched
            back from the writer right after it was appended.
    """
    reader_total = reader.getNumPages()
    # Pages already present in the writer shift the positions of new ones.
    base = self.getNumPages()

    for offset in range(reader_total):
        self.addPage(reader.getPage(offset))
        appended = self.getPage(base + offset)
        if callable(after_page_append):
            after_page_append(appended)
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.
    Every annotation of the page whose field name (/T) equals a key of
    *fields* gets its value (/V) set to the corresponding text. A page
    without an /Annots entry is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    # A page with no annotations has no fields to fill in; indexing
    # '/Annots' unconditionally would raise KeyError for such pages.
    if '/Annots' not in page:
        return

    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        # Only /V is written below, so the name can be read once.
        field_name = writer_annot.get('/T')
        for field in fields:
            if field_name == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
def cloneReaderDocumentRoot(self, reader):
    '''
    Adopt the reader's document root as this writer's root object.

    The ``/Root`` entry of ``reader.trailer`` is stored as-is; no copy
    is made here.

    :param reader: PdfFileReader whose document root should be used.
    '''
    trailer = reader.trailer
    self._root_object = trailer['/Root']
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Clone a document from a PDF file reader: first adopt the reader's
    document root, then append all of its pages.

    :param reader: PDF file reader instance to clone from.
    :callback after_page_append (function): Handed unchanged to
        appendPagesFromReader, which invokes it after each page is
        appended. Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page
            just appended to the document.
    '''
    # The root must be in place before pages are appended.
    self.cloneReaderDocumentRoot(reader)
    self.appendPagesFromReader(reader, after_page_append)
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Stores the file identifier in ``self._ID``, a reference to the
    encryption dictionary in ``self._encrypt`` and the computed key in
    ``self._encrypt_key``.

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions. By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption. When false, 40bit encryption will be used. By default,
        this flag is on.
    """
    import random
    import time

    # Identity test: None is a singleton and must not go through __eq__.
    if owner_pwd is None:
        owner_pwd = user_pwd

    # keylen is expressed in bytes.
    if use_128bit:
        V, rev, keylen = 2, 3, 128 // 8
    else:
        V, rev, keylen = 1, 2, 40 // 8

    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))

    # Two identifier strings, hashed from the current time and a random
    # number respectively.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))

    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        # rev can only be 2 or 3 here (see the branch above).
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

    encrypt = DictionaryObject()
    encrypt[NameObject("/Filter")] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is expressed in bits.
        encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt[NameObject("/R")] = NumberObject(rev)
    encrypt[NameObject("/O")] = ByteStringObject(O)
    encrypt[NameObject("/U")] = ByteStringObject(U)
    encrypt[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
def write(self, stream):
    """
    Writes the collection of pages added to this object out as a PDF file.

    The output consists of, in order: the header line, a binary comment
    line, every object of ``self._objects``, the cross-reference table,
    the trailer dictionary and the ``startxref``/EOF marker.

    :param stream: An object to write the file to. The object must support
        the write method and the tell method, similar to a file object.
    """
    # Only streams that expose a ``mode`` attribute can be checked.
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)
    # Local switch for the diagnostic print further down.
    debug = False
    import struct
    # Register the catalog as an indirect object the first time we write.
    if not self._root:
        self._root = self._addObject(self._root_object)
    externalReferenceMap = {}
    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations). Those will be
    # indirect references to objects that we've recreated in this PDF. To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references. This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex in range(len(self._objects)):
        obj = self._objects[objIndex]
        if isinstance(obj, PageObject) and obj.indirectRef != None:
            data = obj.indirectRef
            # Map layout: source pdf -> generation -> object number -> new ref.
            if data.pdf not in externalReferenceMap:
                externalReferenceMap[data.pdf] = {}
            if data.generation not in externalReferenceMap[data.pdf]:
                externalReferenceMap[data.pdf][data.generation] = {}
            externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self)
    # ``self.stack`` holds the object numbers already visited by the sweep;
    # it only exists for the duration of the sweep.
    self.stack = []
    if debug: print(("ERM:", externalReferenceMap, "root:", self._root))
    self._sweepIndirectReferences(externalReferenceMap, self._root)
    del self.stack
    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    stream.write(b_("%\xE2\xE3\xCF\xD3\n"))
    for i in range(len(self._objects)):
        idnum = (i + 1)
        obj = self._objects[i]
        # Remember where each object starts for the xref table.
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        # The encryption dictionary itself is written without a key.
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            # Per-object key: document key followed by the first 3 bytes of
            # the little-endian object number and the first 2 bytes of the
            # little-endian generation number (0), MD5-hashed and truncated
            # to at most 16 bytes.
            pack1 = struct.pack("<i", i + 1)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))
    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    # Entry for object 0: marked "f" with generation 65535.
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))
    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info,
    })
    # /ID and /Encrypt exist only after encrypt() has been called.
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)
    # eof -- the four percent signs collapse to "%%" under %-formatting.
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
def addMetadata(self, infos):
    """
    Merge custom metadata into the document information dictionary.

    :param dict infos: a Python dictionary where each key is a field
        and each value is your new metadata.
    """
    entries = {
        NameObject(field): createStringObject(text)
        for field, text in infos.items()
    }
    self.getObject(self._info).update(entries)
def _sweepIndirectReferences(self, externMap, data):
    """
    Recursively walk *data* and rewrite it so that it only refers to
    objects of this writer.

    Dictionaries and arrays are processed in place: each value is swept,
    and a value that turns out to be a stream is registered with
    ``_addObject`` and replaced by the resulting reference. A reference
    into another PDF is resolved through that PDF, copied into
    ``self._objects`` and replaced by a reference into this writer.

    Relies on ``self.stack`` (the list of object numbers already visited),
    which ``write`` creates before starting the sweep.

    :param externMap: nested dictionary, source pdf -> generation ->
        object number -> reference in this writer; extended as foreign
        objects are copied.
    :param data: the object to sweep.
    :return: *data* itself for dictionaries, arrays, own references and
        anything else; for a foreign reference, its counterpart in this
        writer, or a ``NullObject`` when a ``ValueError`` is raised while
        copying it.
    :raises ValueError: if the stream of a foreign reference's PDF is
        closed.
    """
    # Local switch for the diagnostic print below.
    debug = False
    if debug: print((data, "TYPE", data.__class__.__name__))
    if isinstance(data, DictionaryObject):
        # Iterate over a snapshot because entries are reassigned in the loop.
        for key, value in list(data.items()):
            origvalue = value  # not used afterwards
            value = self._sweepIndirectReferences(externMap, value)
            if isinstance(value, StreamObject):
                # a dictionary value is a stream. streams must be indirect
                # objects, so we need to change this value.
                value = self._addObject(value)
            data[key] = value
        return data
    elif isinstance(data, ArrayObject):
        for i in range(len(data)):
            value = self._sweepIndirectReferences(externMap, data[i])
            if isinstance(value, StreamObject):
                # an array value is a stream. streams must be indirect
                # objects, so we need to change this value
                value = self._addObject(value)
            data[i] = value
        return data
    elif isinstance(data, IndirectObject):
        # internal indirect references are fine
        if data.pdf == self:
            # Already visited: stop here so reference cycles terminate.
            if data.idnum in self.stack:
                return data
            else:
                self.stack.append(data.idnum)
                realdata = self.getObject(data)
                self._sweepIndirectReferences(externMap, realdata)
                return data
        else:
            if data.pdf.stream.closed:
                raise ValueError("I/O operation on closed file: {}".format(data.pdf.stream.name))
            # Has this foreign object been copied (or pre-mapped) already?
            newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
            if newobj == None:
                try:
                    newobj = data.pdf.getObject(data)
                    # Reserve the slot and record the mapping BEFORE sweeping
                    # the object, so that references back to it met during
                    # the recursion find the mapping instead of copying again.
                    self._objects.append(None) # placeholder
                    idnum = len(self._objects)
                    newobj_ido = IndirectObject(idnum, 0, self)
                    if data.pdf not in externMap:
                        externMap[data.pdf] = {}
                    if data.generation not in externMap[data.pdf]:
                        externMap[data.pdf][data.generation] = {}
                    externMap[data.pdf][data.generation][data.idnum] = newobj_ido
                    newobj = self._sweepIndirectReferences(externMap, newobj)
                    self._objects[idnum-1] = newobj
                    return newobj_ido
                except ValueError:
                    # Unable to resolve the Object, returning NullObject instead.
                    warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format(
                        data.__class__.__name__, data
                    ))
                    return NullObject()
            return newobj
    else:
        # Anything else is returned untouched.
        return data
def getReference(self, obj):
    """
    Return an indirect reference to *obj*, which must already be stored
    in ``self._objects`` (``list.index`` raises ``ValueError`` otherwise).
    """
    position = self._objects.index(obj)
    # Object numbers are 1-based; the generation is always 0.
    ref = IndirectObject(position + 1, 0, self)
    assert ref.getObject() == obj
    return ref
def getOutlineRoot(self):
    """
    Return the outline root object, creating and registering an empty
    one under the catalog's ``/Outlines`` key when it does not exist yet.
    """
    catalog = self._root_object
    if '/Outlines' not in catalog:
        outline = TreeObject()
        outline.update({ })
        catalog[NameObject('/Outlines')] = self._addObject(outline)
        return outline

    outline = catalog['/Outlines']
    # Sanity check: an existing outline must be one of our own objects.
    position = self._objects.index(outline)
    outline_ref = IndirectObject(position + 1, 0, self)
    assert outline_ref.getObject() == outline
    return outline
def getNamedDestRoot(self):
    """
    Return the ``/Names`` array of the catalog's ``/Names`` -> ``/Dests``
    dictionary, creating and registering whichever of the intermediate
    dictionaries (and the array itself) is missing.
    """
    catalog = self._root_object

    # Level 1: the catalog's /Names dictionary.
    if '/Names' in catalog and isinstance(catalog['/Names'], DictionaryObject):
        names = catalog['/Names']
        # Sanity check: it must be one of our own objects.
        names_ref = IndirectObject(self._objects.index(names) + 1, 0, self)
        assert names_ref.getObject() == names
    else:
        names = DictionaryObject()
        catalog[NameObject('/Names')] = self._addObject(names)

    # Level 2: the /Dests dictionary below it. A freshly created /Names
    # dictionary is empty, so it always takes the "else" branch.
    if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject):
        dests = names['/Dests']
        dests_ref = IndirectObject(self._objects.index(dests) + 1, 0, self)
        assert dests_ref.getObject() == dests
        if '/Names' in dests:
            return dests['/Names']
    else:
        dests = DictionaryObject()
        names[NameObject('/Dests')] = self._addObject(dests)

    # Level 3: no destination array yet, start an empty one.
    nd = ArrayObject()
    dests[NameObject('/Names')] = nd
    return nd
def addBookmarkDestination(self, dest, parent=None):
    """
    Register *dest* with the writer and attach it as a child of *parent*.

    :param dest: the destination object to add.
    :param parent: object whose ``getObject()`` yields the node that
        receives the new child; when ``None`` the outline root is used.
    :return: the reference returned by ``self._addObject(dest)``.
    """
    destRef = self._addObject(dest)

    # Always fetched, as before: this creates the outline root if missing.
    outlineRef = self.getOutlineRoot()

    # Identity test: a parent with a custom __eq__ must not be mistaken
    # for "no parent given".
    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(destRef, self)

    return destRef
def addBookmarkDict(self, bookmark, parent=None):
    """
    Add a bookmark described by a plain mapping.

    The entries of *bookmark* are copied into a new ``TreeObject`` (keys are
    wrapped in ``NameObject``). If the mapping has an ``/A`` entry, that
    action is copied into its own dictionary, registered with the writer and
    referenced from the bookmark.

    :param bookmark: mapping of bookmark entries.
    :param parent: object whose ``getObject()`` yields the node that
        receives the bookmark; when ``None`` the outline root is used.
    :return: the reference returned by ``self._addObject`` for the bookmark.
    """
    bookmarkObj = TreeObject()
    for k, v in list(bookmark.items()):
        bookmarkObj[NameObject(str(k))] = v
    bookmarkObj.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for k, v in list(bookmark['/A'].items()):
            action[NameObject(str(k))] = v
        actionRef = self._addObject(action)
        bookmarkObj[NameObject('/A')] = actionRef

    bookmarkRef = self._addObject(bookmarkObj)

    outlineRef = self.getOutlineRoot()

    # Identity test rather than ``==`` so a custom __eq__ cannot interfere.
    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)

    return bookmarkRef
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
    """
    Add a bookmark to this PDF file.

    :param str title: Title to use for this bookmark.
    :param int pagenum: Page number this bookmark will point to.
    :param parent: A reference to a parent bookmark to create nested
        bookmarks.
    :param tuple color: Color of the bookmark as a red, green, blue tuple
        from 0.0 to 1.0
    :param bool bold: Bookmark is bold
    :param bool italic: Bookmark is italic
    :param str fit: The fit of the destination page. See
        :meth:`addLink()<addLink>` for details.
    :param args: extra zoom arguments for *fit*; ``None`` becomes a null
        object.
    :return: the reference returned by ``self._addObject`` for the bookmark.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]
    action = DictionaryObject()
    zoomArgs = []
    for a in args:
        if a is not None:
            zoomArgs.append(NumberObject(a))
        else:
            zoomArgs.append(NullObject())
    dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs)
    destArray = dest.getDestArray()
    action.update({
        NameObject('/D') : destArray,
        NameObject('/S') : NameObject('/GoTo')
    })
    actionRef = self._addObject(action)

    outlineRef = self.getOutlineRoot()

    # Identity test rather than ``==`` so a custom __eq__ cannot interfere.
    if parent is None:
        parent = outlineRef

    bookmark = TreeObject()

    bookmark.update({
        NameObject('/A'): actionRef,
        NameObject('/Title'): createStringObject(title),
    })

    if color is not None:
        bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])})

    # /F style flags: 1 = italic, 2 = bold. Named so it no longer shadows
    # the ``format`` builtin.
    styleFlags = 0
    if italic:
        styleFlags += 1
    if bold:
        styleFlags += 2
    if styleFlags:
        bookmark.update({NameObject('/F'): NumberObject(styleFlags)})

    bookmarkRef = self._addObject(bookmark)

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)

    return bookmarkRef
def addNamedDestinationObject(self, dest):
    """
    Register *dest* and append ``[dest['/Title'], reference]`` to the named
    destination array.

    :param dest: destination object providing a ``/Title`` entry.
    :return: the reference returned by ``self._addObject(dest)``.
    """
    reference = self._addObject(dest)
    destinations = self.getNamedDestRoot()
    entry = [dest['/Title'], reference]
    destinations.extend(entry)
    return reference
def addNamedDestination(self, title, pagenum):
    """
    Create a ``/GoTo`` action targeting page *pagenum* (``/FitH`` at 826)
    and record it under *title* in the named destination array.

    :param title: name stored in the named destination array.
    :param int pagenum: index into the ``/Kids`` of the page tree root.
    :return: the reference returned by ``self._addObject`` for the action.
    """
    kids = self.getObject(self._pages)['/Kids']
    pageRef = kids[pagenum]

    target = ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)])
    dest = DictionaryObject()
    dest.update({
        NameObject('/D') : target,
        NameObject('/S') : NameObject('/GoTo')
    })

    destRef = self._addObject(dest)
    self.getNamedDestRoot().extend([title, destRef])
    return destRef
def removeLinks(self):
    """
    Removes links and annotations from this output.

    Deletes the ``/Annots`` entry of every page listed in the ``/Kids`` of
    the page tree root.
    """
    kids = self.getObject(self._pages)['/Kids']
    for kid in kids:
        pageObj = self.getObject(kid)
        if "/Annots" not in pageObj:
            continue
        del pageObj['/Annots']
def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    Each page's content stream is rewritten without the ``re`` operator and
    without the listed graphics operators whenever they occur after a ``q``
    and before the matching ``Q``.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects.
    """
    # Operators dropped while inside a q ... Q sequence.
    droppedInSequence = [
        b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
        b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
        b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh'),
    ]

    for page in self.getObject(self._pages)['/Kids']:
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        kept = []
        inSequence = False
        for operands, operator in content.operations:
            # Text-showing operators: with the flag set, any operand that is
            # not a TextStringObject is replaced by an empty one.
            if operator == b_('Tj') or operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                pieces = operands[0]
                for i in range(len(pieces)):
                    if ignoreByteStringObject and not isinstance(pieces[i], TextStringObject):
                        pieces[i] = TextStringObject()

            if operator == b_('q'):
                inSequence = True
            if operator == b_('Q'):
                inSequence = False

            if inSequence and operator in droppedInSequence:
                continue
            if operator == b_('re'):
                continue
            kept.append((operands, operator))

        content.operations = kept
        pageRef.__setitem__(NameObject('/Contents'), content)
def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output.

    The operands of the text-showing operators (``Tj``, ``'``, ``"`` and
    ``TJ``) are replaced by empty ``TextStringObject`` instances.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects.
    """
    def _shouldBlank(candidate):
        # Text strings are always blanked; byte strings only with the flag.
        if isinstance(candidate, TextStringObject):
            return True
        if ignoreByteStringObject:
            return isinstance(candidate, ByteStringObject)
        return False

    for page in self.getObject(self._pages)['/Kids']:
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        for operands, operator in content.operations:
            if operator == b_('Tj') or operator == b_("'"):
                if _shouldBlank(operands[0]):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                if _shouldBlank(operands[2]):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                pieces = operands[0]
                for i in range(len(pieces)):
                    if _shouldBlank(pieces[i]):
                        pieces[i] = TextStringObject()

        pageRef.__setitem__(NameObject('/Contents'), content)
def addURI(self, pagenum, uri, rect, border=None):
    """
    Add an URI from a rectangular area to the specified page.
    This uses the basic structure of AddLink

    :param int pagenum: index of the page on which to place the URI action.
    :param str uri: uri of resource to link to.
    :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. When omitted the border
        array is ``[2, 2, 2]``.

    REMOVED FIT/ZOOM ARG
    -John Mulligan
    """
    pageLink = self.getObject(self._pages)['/Kids'][pagenum]
    pageRef = self.getObject(pageLink)

    if border is None:
        borderArr = [NumberObject(2)] * 3
    else:
        borderArr = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            borderArr.append(ArrayObject([NameObject(n) for n in border[3]]))

    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    uriAction = DictionaryObject()
    uriAction.update({
        NameObject('/S'): NameObject('/URI'),
        NameObject('/URI'): TextStringObject(uri)
    })

    annotation = DictionaryObject()
    annotation.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): pageLink,
        NameObject('/Rect'): rect,
        NameObject('/H'): NameObject('/I'),
        NameObject('/Border'): ArrayObject(borderArr),
        NameObject('/A'): uriAction
    })
    annotationRef = self._addObject(annotation)

    if "/Annots" in pageRef:
        pageRef['/Annots'].append(annotationRef)
    else:
        pageRef[NameObject('/Annots')] = ArrayObject([annotationRef])
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Add an internal link from a rectangular area to the specified page.

    :param int pagenum: index of the page on which to place the link.
    :param int pagedest: index of the page to which the link should go.
    :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. No border will be
        drawn if this argument is omitted.
    :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
        to be supplied. Passing ``None`` will be read as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit       No additional arguments
         /XYZ       [left] [top] [zoomFactor]
         /FitH      [top]
         /FitV      [left]
         /FitR      [left] [bottom] [right] [top]
         /FitB      No additional arguments
         /FitBH     [top]
         /FitBV     [left]
    """
    kids = self.getObject(self._pages)['/Kids']
    pageLink = kids[pagenum]
    # TODO: switch for external link
    pageDest = self.getObject(self._pages)['/Kids'][pagedest]
    pageRef = self.getObject(pageLink)

    if border is None:
        borderArr = [NumberObject(0)] * 3
    else:
        borderArr = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            borderArr.append(ArrayObject([NameObject(n) for n in border[3]]))

    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    zoomArgs = []
    for value in args:
        if value is None:
            zoomArgs.append(NullObject())
        else:
            zoomArgs.append(NumberObject(value))

    # TODO: create a better name for the link
    dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs)
    destArray = dest.getDestArray()

    annotation = DictionaryObject()
    annotation.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): pageLink,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(borderArr),
        NameObject('/Dest'): destArray
    })
    annotationRef = self._addObject(annotation)

    if "/Annots" in pageRef:
        pageRef['/Annots'].append(annotationRef)
    else:
        pageRef[NameObject('/Annots')] = ArrayObject([annotationRef])
- _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
def getPageLayout(self):
    """
    Get the page layout.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    try:
        layout = self._root_object['/PageLayout']
    except KeyError:
        # The root object has no /PageLayout entry.
        layout = None
    return layout
def setPageLayout(self, layout):
    """
    Set the page layout

    A value that is not already a ``NameObject`` is wrapped in one; a
    warning is issued first if it is not one of the valid layouts.

    :param str layout: The page layout to be used

    Valid layouts are:
         /NoLayout        Layout explicitly not specified
         /SinglePage      Show one page at a time
         /OneColumn       Show one column at a time
         /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
         /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
         /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
         /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
    """
    alreadyName = isinstance(layout, NameObject)
    if not alreadyName:
        known = self._valid_layouts
        if layout not in known:
            message = "Layout should be one of: {}".format(', '.join(self._valid_layouts))
            warnings.warn(message)
        layout = NameObject(layout)
    entry = {NameObject('/PageLayout'): layout}
    self._root_object.update(entry)
- pageLayout = property(getPageLayout, setPageLayout)
- """Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
- and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
- _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']
def getPageMode(self):
    """
    Get the page mode.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
    of valid modes.

    :return: Page mode currently being used.
    :rtype: str, None if not specified
    """
    try:
        mode = self._root_object['/PageMode']
    except KeyError:
        # The root object has no /PageMode entry.
        mode = None
    return mode
def setPageMode(self, mode):
    """
    Set the page mode.

    A value that is not already a ``NameObject`` is wrapped in one; a
    warning is issued first if it is not one of the valid modes.

    :param str mode: The page mode to use.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    alreadyName = isinstance(mode, NameObject)
    if not alreadyName:
        known = self._valid_modes
        if mode not in known:
            message = "Mode should be one of: {}".format(', '.join(self._valid_modes))
            warnings.warn(message)
        mode = NameObject(mode)
    entry = {NameObject('/PageMode'): mode}
    self._root_object.update(entry)
- pageMode = property(getPageMode, setPageMode)
- """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
- and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
- class PdfFileReader(object):
- """
- Initializes a PdfFileReader object. This operation can take some time, as
- the PDF stream's cross-reference tables are read into memory.
- :param stream: A File object or an object that supports the standard read
- and seek methods similar to a File object. Could also be a
- string representing a path to a PDF file.
- :param bool strict: Determines whether user should be warned of all
- problems and also causes some correctable problems to be fatal.
- Defaults to ``True``.
- :param warndest: Destination for logging warnings (defaults to
- ``sys.stderr``).
- :param bool overwriteWarnings: Determines whether to override Python's
- ``warnings.py`` module with a custom implementation (defaults to
- ``True``).
- """
def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
    """
    Set up the reader state and parse *stream*.

    :param stream: a file-like object, or a path string; a path is read
        fully into an in-memory ``BytesIO`` before parsing.
    :param bool strict: stored as ``self.strict``.
    :param warndest: file object warnings are written to when
        *overwriteWarnings* is set; ``None`` means ``sys.stderr``.
    :param bool overwriteWarnings: when true, ``warnings.showwarning`` is
        replaced process-wide by a custom implementation.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                # It is possible for sys.stderr to be defined as None, most commonly in the case that the script
                # is being run via pythonw.exe on Windows. In this case, just swallow the warning.
                # See also https://docs.python.org/3/library/sys.html#sys.__stderr__
                if file is not None:
                    file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None # map page IndirectRef number to Page Number
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    self.read(stream)
    self.stream = stream

    self._override_encryption = False
def getDocumentInfo(self):
    """
    Retrieves the PDF file's document information dictionary, if it exists.
    Note that some PDF files use metadata streams instead of docinfo
    dictionaries, and these metadata streams will not be accessed by this
    function.

    :return: the document information of this PDF file
    :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
    """
    if "/Info" in self.trailer:
        source = self.trailer['/Info']
        # Copy the trailer's /Info entries into a fresh container.
        info = DocumentInformation()
        info.update(source)
        return info
    return None
- documentInfo = property(lambda self: self.getDocumentInfo(), None, None)
- """Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""
def getXmpMetadata(self):
    """
    Retrieves XMP (Extensible Metadata Platform) data from the PDF document
    root.

    ``self._override_encryption`` is set for the duration of the lookup and
    always reset to ``False`` afterwards.

    :return: a :class:`XmpInformation<xmp.XmpInformation>`
        instance that can be used to access XMP metadata from the document.
    :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
        ``None`` if no metadata was found on the document root.
    """
    self._override_encryption = True
    try:
        root = self.trailer["/Root"]
        return root.getXmpMetadata()
    finally:
        self._override_encryption = False
- xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
- """
- Read-only property that accesses the
- :meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
- """
def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """
    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        try:
            self._override_encryption = True
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False
    else:
        # Identity test: an empty (but already built) page list must not
        # be confused with "not flattened yet".
        if self.flattenedPages is None:
            self._flatten()
        return len(self.flattenedPages)
- numPages = property(lambda self: self.getNumPages(), None, None)
- """
- Read-only property that accesses the
- :meth:`getNumPages()<PdfFileReader.getNumPages>` function.
- """
def getPage(self, pageNumber):
    """
    Retrieves a page by number from this PDF file.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: a :class:`PageObject<pdf.PageObject>` instance.
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    # The flattened page list is built on first use; identity test so an
    # already-built empty list is not rebuilt.
    if self.flattenedPages is None:
        self._flatten()
    return self.flattenedPages[pageNumber]
- namedDestinations = property(lambda self:
- self.getNamedDestinations(), None, None)
- """
- Read-only property that accesses the
- :meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
- """
- # A select group of relevant field attributes. For the complete list,
- # see section 8.6.2 of the PDF 1.7 reference.
def getFields(self, tree = None, retval = None, fileobj = None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
                   "/T" : "Field Name", "/TU" : "Alternate Field Name",
                   "/TM" : "Mapping Name", "/Ff" : "Field Flags",
                   "/V" : "Value", "/DV" : "Default Value"}
    # ``retval is None`` marks the top-level call; recursive calls pass the
    # shared accumulator (possibly still empty), so identity is required.
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" in catalog:
            tree = catalog["/AcroForm"]
        else:
            return None
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)
    for attr in fieldAttributes:
        if attr in tree:
            # Tree is a field
            self._buildField(tree, retval, fileobj, fieldAttributes)
            break

    if "/Fields" in tree:
        fields = tree["/Fields"]
        for f in fields:
            field = f.getObject()
            self._buildField(field, retval, fileobj, fieldAttributes)

    return retval
def _buildField(self, field, retval, fileobj, fieldAttributes):
    """
    Record *field* in *retval*, keyed by its ``/TM`` entry or, failing
    that, its ``/T`` entry. Fields with neither are skipped. Kids of the
    field are processed first.

    :param field: the field dictionary.
    :param dict retval: accumulator receiving ``Field`` wrappers.
    :param fileobj: optional report file; when truthy a description of the
        field followed by a blank line is written to it.
    :param dict fieldAttributes: attribute-name mapping passed to
        ``_writeField``.
    """
    self._checkKids(field, retval, fileobj)

    if "/TM" in field or "/T" in field:
        try:
            name = field["/TM"]
        except KeyError:
            try:
                name = field["/T"]
            except KeyError:
                # Ignore no-name field for now
                return
    else:
        # Ignore no-name field for now
        return

    if fileobj:
        self._writeField(fileobj, field, fieldAttributes)
        fileobj.write("\n")
    retval[name] = Field(field)
def _checkKids(self, tree, retval, fileobj):
    """
    Recurse into every entry of ``tree["/Kids"]`` (if present) through
    :meth:`getFields`, passing along the shared accumulator and report file.
    """
    if "/Kids" not in tree:
        return
    # recurse down the tree
    for child in tree["/Kids"]:
        resolved = child.getObject()
        self.getFields(resolved, retval, fileobj)
def _writeField(self, fileobj, field, fieldAttributes):
    """
    Write a ``label: value`` line to *fileobj* for each known attribute
    present on *field*, in a fixed order.

    :param fileobj: object with a ``write`` method.
    :param field: the field dictionary.
    :param dict fieldAttributes: maps attribute keys to their labels; it
        must contain every key in the fixed order list.
    """
    # Human-readable names for the /FT values.
    typeNames = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice",
                 "/Sig":"Signature"}
    for attr in ("/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"):
        label = fieldAttributes[attr]
        try:
            if attr == "/FT":
                # Make the field type value more clear
                kind = field[attr]
                if kind in typeNames:
                    fileobj.write(label + ": " + typeNames[field[attr]] + "\n")
            elif attr == "/Parent":
                # Let's just write the name of the parent
                try:
                    parentName = field["/Parent"]["/TM"]
                except KeyError:
                    parentName = field["/Parent"]["/T"]
                fileobj.write(label + ": " + parentName + "\n")
            else:
                fileobj.write(label + ": " + str(field[attr]) + "\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            continue
def getFormTextFields(self):
    """
    Retrieves form fields from the document with textual data (inputs, dropdowns)

    :return: a dictionary mapping the ``/T`` entry of every field whose
        ``/FT`` is ``/Tx`` to its ``/V`` entry (``None`` when absent).
        Empty when :meth:`getFields` finds no form data.
    :rtype: dict
    """
    # Retrieve document form fields
    formfields = self.getFields()
    if formfields is None:
        # getFields() returns None when no form data is found; iterating
        # over it used to raise TypeError.
        return {}
    return dict(
        (field['/T'], field.get('/V'))
        for field in (formfields[name] for name in formfields)
        if field.get('/FT') == '/Tx'
    )
def getNamedDestinations(self, tree=None, retval=None):
    """
    Retrieves the named destinations present in the document.

    The *tree* and *retval* parameters are for recursive use.

    :return: a dictionary which maps names to
        :class:`Destinations<PyPDF2.generic.Destination>`.
    :rtype: dict
    """
    # ``retval is None`` marks the top-level call; recursive calls pass the
    # shared accumulator (possibly still empty), so identity is required.
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]

        # get the name tree
        if "/Dests" in catalog:
            tree = catalog["/Dests"]
        elif "/Names" in catalog:
            names = catalog['/Names']
            if "/Dests" in names:
                tree = names['/Dests']

    if tree is None:
        return retval

    if "/Kids" in tree:
        # recurse down the tree
        for kid in tree["/Kids"]:
            self.getNamedDestinations(kid.getObject(), retval)

    if "/Names" in tree:
        names = tree["/Names"]
        # The array alternates name, destination, name, destination, ...
        for i in range(0, len(names), 2):
            key = names[i].getObject()
            val = names[i+1].getObject()
            if isinstance(val, DictionaryObject) and '/D' in val:
                val = val['/D']
            dest = self._buildDestination(key, val)
            if dest is not None:
                retval[key] = dest

    return retval
- outlines = property(lambda self: self.getOutlines(), None, None)
- """
- Read-only property that accesses the
- :meth:`getOutlines()<PdfFileReader.getOutlines>` function.
- """
def getOutlines(self, node=None, outlines=None):
    """
    Retrieves the document outline present in the document.

    The *node* and *outlines* parameters are for recursive use.

    :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
    """
    # ``outlines is None`` marks the top-level call; recursive calls pass an
    # empty list to fill, so identity (not equality/truthiness) is required.
    if outlines is None:
        outlines = []
        catalog = self.trailer["/Root"]

        # get the outline dictionary and named destinations
        if "/Outlines" in catalog:
            try:
                lines = catalog["/Outlines"]
            except utils.PdfReadError:
                # this occurs if the /Outlines object reference is incorrect
                # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
                # so continue to load the file without the Bookmarks
                return outlines

            if "/First" in lines:
                node = lines["/First"]
        self._namedDests = self.getNamedDestinations()

    if node is None:
        return outlines

    # see if there are any more outlines
    while True:
        outline = self._buildOutline(node)
        if outline:
            outlines.append(outline)

        # check for sub-outlines
        if "/First" in node:
            subOutlines = []
            self.getOutlines(node["/First"], subOutlines)
            if subOutlines:
                outlines.append(subOutlines)

        if "/Next" not in node:
            break
        node = node["/Next"]

    return outlines
def _getPageNumberByIndirect(self, indirectRef):
    """
    Look up the page index for an indirect reference (or a bare object
    number), building the ``_pageId2Num`` cache on first use.

    :param indirectRef: an ``int`` object number, or an object exposing
        ``idnum``.
    :return: the page index, or -1 if the number is not a known page.
    :rtype: int
    """
    if self._pageId2Num is None:
        # Cache: object number of each page -> position in self.pages.
        self._pageId2Num = dict(
            (page.indirectRef.idnum, index)
            for index, page in enumerate(self.pages)
        )

    wanted = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
    return self._pageId2Num.get(wanted, -1)
def getPageNumber(self, page):
    """
    Retrieve page number of a given PageObject

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(page.indirectRef)
def getDestinationPageNumber(self, destination):
    """
    Retrieve page number of a given Destination object

    :param Destination destination: The destination to get page number.
         Should be an instance of
         :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(destination.page)
def _buildDestination(self, title, array):
    """
    Build a ``Destination`` from a title and a destination array.

    :param title: title given to the destination.
    :param array: sequence whose first two items are the page and the fit
        type; any remaining items are passed on as extra arguments.
    :return: the new ``Destination``.
    """
    head = array[0:2]
    page, typ = head
    extra = array[2:]
    return Destination(title, page, typ, *extra)
def _buildOutline(self, node):
    """
    Turn an outline node into a destination.

    :param node: outline item dictionary.
    :return: a destination built from the node's ``/A`` (``/GoTo`` only) or
        ``/Dest`` entry, or ``None`` when the node has no usable destination.
    :raises PdfReadError: if the destination is neither an array nor a
        known named destination.
    """
    dest = None
    title = None

    hasTitle_A = "/A" in node and "/Title" in node
    if hasTitle_A:
        # Action, section 8.5 (only type GoTo supported)
        title = node["/Title"]
        action = node["/A"]
        if action["/S"] == "/GoTo":
            dest = action["/D"]
    elif "/Dest" in node and "/Title" in node:
        # Destination, section 8.2.1
        title = node["/Title"]
        dest = node["/Dest"]

    # Without a destination there is no outline to create.
    if not dest:
        return None

    if isinstance(dest, ArrayObject):
        return self._buildDestination(title, dest)
    if isString(dest) and dest in self._namedDests:
        outline = self._namedDests[dest]
        outline[NameObject("/Title")] = title
        return outline
    raise utils.PdfReadError("Unexpected destination %r" % dest)
- pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
- None, None)
- """
- Read-only property that emulates a list based upon the
- :meth:`getNumPages()<PdfFileReader.getNumPages>` and
- :meth:`getPage()<PdfFileReader.getPage>` methods.
- """
def getPageLayout(self):
    """
    Get the page layout.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
    for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        layout = self.trailer['/Root']['/PageLayout']
    except KeyError:
        # Either lookup failing means no layout is specified.
        layout = None
    return layout
- pageLayout = property(getPageLayout)
- """Read-only property accessing the
- :meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
def getPageMode(self):
    """
    Get the page mode.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
    for a description of valid modes.

    :return: Page mode currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        mode = self.trailer['/Root']['/PageMode']
    except KeyError:
        # Either lookup failing means no mode is specified.
        mode = None
    return mode
- pageMode = property(getPageMode)
- """Read-only property accessing the
- :meth:`getPageMode()<PdfFileReader.getPageMode>` method."""
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree and append a ``PageObject`` for every ``/Page`` node
    to ``self.flattenedPages``.

    :param pages: the tree node to process; ``None`` resets
        ``self.flattenedPages`` and starts at the catalog's ``/Pages``.
    :param dict inherit: inheritable attributes collected from the
        ancestors of *pages*.
    :param indirectRef: reference handed to ``PageObject`` for a leaf.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
        )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    # A node without /Type is treated as an intermediate /Pages node.
    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a copy: attributes set by this node apply to its own
        # descendants only and must not leak into sibling subtrees through
        # the dict shared with the caller.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
def _getObjectFromStream(self, indirectReference):
    """
    Read an object that is stored inside an object stream.

    :param indirectReference: reference whose ``idnum`` is a key of
        ``self.xref_objStm``.
    :return: the object read from the stream, or ``NullObject`` when it
        cannot be found or read in non-strict mode.
    :raises PdfReadError: in strict mode, when the object sits at the wrong
        index, cannot be read, or is not present in the stream.
    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, idx = self.xref_objStm[indirectReference.idnum]
    objStm = IndirectObject(stmnum, 0, self).getObject()
    # This is an xref to a stream, so its type better be a stream
    assert objStm['/Type'] == '/ObjStm'
    # /N is the number of indirect objects in the stream
    assert idx < objStm['/N']
    streamData = BytesIO(b_(objStm.getData()))
    for i in range(objStm['/N']):
        # Each header entry is a pair: object number, offset.
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)
        objnum = NumberObject.readFromStream(streamData)
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)
        offset = NumberObject.readFromStream(streamData)
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)
        if objnum != indirectReference.idnum:
            # We're only interested in one object
            continue
        if self.strict and idx != i:
            raise utils.PdfReadError("Object is in wrong index.")
        streamData.seek(objStm['/First']+offset, 0)
        try:
            obj = readObject(streamData, self)
        except utils.PdfStreamError as e:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
                  (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)

            if self.strict:
                raise utils.PdfReadError("Can't read object stream: %s" % e)
            # Replace with null. Hopefully it's nothing important.
            obj = NullObject()
        return obj

    if self.strict:
        raise utils.PdfReadError("This is a fatal error in strict mode.")
    return NullObject()
def getObject(self, indirectReference):
    """
    Resolve an indirect reference to the object it points to.

    Cached objects are returned directly. Otherwise the object is read from
    an object stream or from the offset recorded in ``self.xref``, decrypted
    if the file is encrypted, and cached.

    :param indirectReference: reference exposing ``idnum`` and
        ``generation``.
    :return: the resolved object.
    :raises PdfReadError: if the object is not defined, the header found in
        the file does not match in strict mode, or the file is encrypted and
        has not been decrypted.
    """
    refId = indirectReference.idnum
    refGen = indirectReference.generation

    retval = self.cacheGetIndirectObject(refGen, refId)
    # Identity test: a cached object that compares equal to None or is
    # falsy is still a valid cache hit.
    if retval is not None:
        return retval
    if refGen == 0 and refId in self.xref_objStm:
        retval = self._getObjectFromStream(indirectReference)
    elif refGen in self.xref and refId in self.xref[refGen]:
        start = self.xref[refGen][refId]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        if idnum != refId and self.xrefIndex:
            # Xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
                                 % (refId, refGen, idnum, generation))
            # else: xref table is corrected in non-strict mode
        elif idnum != refId and self.strict:
            # some other problem
            raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
                                     % (refId, refGen, idnum, generation))
        if self.strict:
            assert generation == refGen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise utils.PdfReadError("file has not been decrypted")
            # otherwise, decrypt here...
            import struct
            # Per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5.
            pack1 = struct.pack("<i", refId)[:3]
            pack2 = struct.pack("<i", refGen)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)
    else:
        warnings.warn("Object %d %d not defined." % (refId, refGen),
                      utils.PdfReadWarning)
        #if self.strict:
        raise utils.PdfReadError("Could not find object.")
    self.cacheIndirectObject(refGen, refId, retval)
    return retval
def _decryptObject(self, obj, key):
    """
    Applies RC4 with ``key`` to ``obj`` and returns the result.

    String objects are replaced by a new string object built from the
    transformed bytes, stream objects have their ``_data`` replaced in
    place, and dictionaries and arrays are walked recursively with every
    member replaced by its processed counterpart.  Any other object is
    returned untouched.

    :param obj: the object to process.
    :param key: the RC4 key to use.
    :return: the processed object.
    """
    if isinstance(obj, (ByteStringObject, TextStringObject)):
        return createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
    if isinstance(obj, StreamObject):
        obj._data = utils.RC4_encrypt(key, obj._data)
        return obj
    if isinstance(obj, DictionaryObject):
        # Iterate over a snapshot because entries are reassigned as we go.
        for entry_name, entry_value in list(obj.items()):
            obj[entry_name] = self._decryptObject(entry_value, key)
        return obj
    if isinstance(obj, ArrayObject):
        for position, member in enumerate(obj):
            obj[position] = self._decryptObject(member, key)
    return obj
def readObjectHeader(self, stream):
    """
    Reads an object header from ``stream`` and returns its two numbers.

    Should never be necessary to read out whitespace, since the
    cross-reference table should put us in the right spot to read the
    object header.  In reality some files have cross-reference tables that
    are off by whitespace bytes, so comments and whitespace are skipped
    before each number.

    :param stream: stream positioned at (or just before) an object header.
    :return: ``(idnum, generation)`` as a tuple of ints.
    """
    utils.skipOverComment(stream)
    whitespace_before_id = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    idnum = readUntilWhitespace(stream)
    whitespace_before_generation = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    generation = readUntilWhitespace(stream)
    # Three bytes are consumed here and discarded (presumably the "obj"
    # keyword), then the stream is left on the next non-whitespace byte.
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    if self.strict and (whitespace_before_id or whitespace_before_generation):
        # not a fatal error
        warnings.warn("Superfluous whitespace found in object header %s %s" % \
                      (idnum, generation), utils.PdfReadWarning)
    return int(idnum), int(generation)
def cacheGetIndirectObject(self, generation, idnum):
    """
    Looks up an already resolved object in ``self.resolvedObjects``.

    :param generation: generation number of the object.
    :param idnum: id number of the object.
    :return: the cached object, or ``None`` when nothing is cached under
        ``(generation, idnum)``.
    """
    cache_key = (generation, idnum)
    return self.resolvedObjects.get(cache_key)
def cacheIndirectObject(self, generation, idnum, obj):
    """
    Stores ``obj`` in ``self.resolvedObjects`` under ``(generation, idnum)``.

    Replacing an existing entry raises in strict mode and only emits a
    warning otherwise.

    :param generation: generation number of the object.
    :param idnum: id number of the object.
    :param obj: the resolved object to remember.
    :return: ``obj``.
    :raises PdfReadError: in strict mode, if the slot is already filled.
    """
    cache_key = (generation, idnum)
    if cache_key in self.resolvedObjects:
        msg = "Overwriting cache for %s %s"%(generation, idnum)
        if self.strict:
            raise utils.PdfReadError(msg)
        warnings.warn(msg)
    self.resolvedObjects[cache_key] = obj
    return obj
def read(self, stream):
    """
    Parses the cross-reference data and trailer of the PDF in ``stream``.

    Working backwards from the end of the stream, this locates the
    ``%%EOF`` marker and the ``startxref`` offset, then reads every
    cross-reference section (classic ``xref`` tables as well as PDF 1.5+
    cross-reference streams), following ``/Prev`` links.  The outcome is
    stored on the instance:

    * ``self.xref``: ``{generation: {idnum: byte offset}}``
    * ``self.xref_objStm``: ``{idnum: (object stream number, index)}``
    * ``self.trailer``: merged trailer dictionary

    Sections are read newest first and existing entries are never
    replaced, so the most recent definition of an object wins.

    :param stream: seekable binary file-like object holding the PDF.
    :raises PdfReadError: if the stream is empty, the ``%%EOF`` or
        ``startxref`` markers cannot be found, no cross-reference section
        exists at the indicated location, or (strict mode) the chain of
        cross-reference sections loops back on itself.
    """
    # Measure the stream before seeking backwards from its end: on an
    # empty real file ``seek(-1, 2)`` fails with an I/O error, so the
    # emptiness check has to come first.  Streams shorter than two bytes
    # are rejected with the same message as before.
    stream.seek(0, 2)
    if stream.tell() < 2:
        raise utils.PdfReadError('Cannot read an empty file')
    # start at the end:
    stream.seek(-1, 2)
    last1K = stream.tell() - 1024 + 1  # offset of last 1024 bytes of stream
    line = b_('')
    while line[:5] != b_("%%EOF"):
        if stream.tell() < last1K:
            raise utils.PdfReadError("EOF marker not found")
        line = self.readNextEndLine(stream)

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    try:
        startxref = int(line)
    except ValueError:
        # 'startxref' may be on the same line as the location
        if not line.startswith(b_("startxref")):
            raise utils.PdfReadError("startxref not found")
        startxref = int(line[9:].strip())
        warnings.warn("startxref on same line as offset")
    else:
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    # Offsets already processed.  Coming back to one of them means the
    # file's sections form a cycle, which would otherwise loop forever.
    visited = set()
    while True:
        if startxref in visited:
            msg = "Xref table loop detected at offset %s" % startxref
            if self.strict:
                raise utils.PdfReadError(msg)
            warnings.warn(msg, utils.PdfReadWarning)
            break
        visited.add(startxref)
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            firsttime = True  # check if the first time looking at the xref table
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    # If the table is not zero-indexed, this could be due
                    # to an error from when the PDF was created, which
                    # will lead to mismatched indices later on.  The
                    # warning is issued in strict mode; the re-indexing
                    # at the end of this method runs in non-strict mode.
                    self.xrefIndex = num
                    if self.strict:
                        warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning)
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    if generation not in self.xref:
                        self.xref[generation] = {}
                    # It really seems like we should allow the last
                    # xref table in the file to override previous
                    # ones. Since we read the file backwards, assume
                    # any existing key is already set correctly.
                    if num not in self.xref[generation]:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag == b_("trailer"):
                    break
                # more xrefs!
                stream.seek(-7, 1)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in list(newTrailer.items()):
                if key not in self.trailer:
                    self.trailer[key] = value
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = BytesIO(b_(xrefstream.getData()))
            # Index pairs specify the subsections in the dictionary. If
            # none create one subsection that spans everything.
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            assert len(entrySizes) >= 3
            if self.strict and len(entrySizes) > 3:
                raise utils.PdfReadError("Too many entry sizes: %s" % entrySizes)

            def getEntry(i):
                # Reads the correct number of bytes for each entry. See the
                # discussion of the W parameter in PDF spec table 17.
                if entrySizes[i] > 0:
                    d = streamData.read(entrySizes[i])
                    return convertToInt(d, entrySizes[i])

                # PDF Spec Table 17: A value of zero for an element in the
                # W array indicates...the default value shall be used
                if i == 0:
                    return 1  # First value defaults to 1
                return 0

            def used_before(num, generation):
                # We move backwards through the xrefs, don't replace any.
                return num in self.xref.get(generation, []) or \
                    num in self.xref_objStm

            # Iterate through each subsection
            last_end = 0
            for start, size in self._pairs(idx_pairs):
                # The subsections must increase
                assert start >= last_end
                last_end = start + size
                for num in range(start, start + size):
                    # The first entry is the type
                    xref_type = getEntry(0)
                    # The rest of the elements depend on the xref_type
                    if xref_type == 0:
                        # linked list of free objects: both fields are
                        # read only to advance past them.
                        getEntry(1)
                        getEntry(2)
                    elif xref_type == 1:
                        # objects that are in use but are not compressed
                        byte_offset = getEntry(1)
                        generation = getEntry(2)
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if not used_before(num, generation):
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        # compressed objects
                        objstr_num = getEntry(1)
                        obstr_idx = getEntry(2)
                        generation = 0  # PDF spec table 18, generation is 0
                        if not used_before(num, generation):
                            self.xref_objStm[num] = (objstr_num, obstr_idx)
                    elif self.strict:
                        raise utils.PdfReadError("Unknown xref type: %s" %
                                                 xref_type)

            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            # No explicit xref table, try finding a cross-reference stream.
            stream.seek(startxref, 0)
            found = False
            for look in range(5):
                if stream.read(1).isdigit():
                    # This is not a standard PDF, consider adding a warning
                    startxref += look
                    found = True
                    break
            if found:
                continue
            # no xref table found at specified location
            raise utils.PdfReadError("Could not find xref table at specified location")

    # if not zero-indexed, verify that the table is correct; change it if necessary
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for obj_id in self.xref[gen]:
                stream.seek(self.xref[gen][obj_id], 0)
                try:
                    pid, _ = self.readObjectHeader(stream)
                except ValueError:
                    break
                if pid == obj_id - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong, or the
                # non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was
- def _zeroXref(self, generation):
- self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) )
- def _pairs(self, array):
- i = 0
- while True:
- yield array[i], array[i+1]
- i += 2
- if (i+1) >= len(array):
- break
def readNextEndLine(self, stream):
    """
    Reads one line of ``stream`` backwards, starting at the current
    position, and returns it without its end-of-line characters.

    Bytes are collected one at a time while stepping towards the start of
    the stream until a CR or LF is met; the run of consecutive CR/LF bytes
    is then skipped as well.

    :param stream: seekable binary stream.
    :return: the bytes of the line, in forward order.
    :raises PdfReadError: if the start of the stream is reached before an
        end-of-line marker is found.
    """
    eol_bytes = (b_('\n'), b_('\r'))  # LF, CR
    line = b_("")
    while True:
        # Prevent infinite loops in malformed PDFs
        if stream.tell() == 0:
            raise utils.PdfReadError("Could not read malformed PDF file")
        x = stream.read(1)
        if stream.tell() < 2:
            raise utils.PdfReadError("EOL marker not found")
        # Step back over the byte just read plus one more.
        stream.seek(-2, 1)
        if x not in eol_bytes:
            line = x + line
            continue

        crlf = False
        while x in eol_bytes:
            x = stream.read(1)
            if x in eol_bytes:  # account for CR+LF
                stream.seek(-1, 1)
                crlf = True
            if stream.tell() < 2:
                raise utils.PdfReadError("EOL marker not found")
            stream.seek(-2, 1)
        # Re-position after the backwards scan: forward 2 bytes when a
        # second EOL byte was seen, otherwise forward 1 byte.
        if crlf:
            stream.seek(2, 1)
        else:
            stream.seek(1, 1)
        break
    return line
def decrypt(self, password):
    """
    Unlocks a file protected with the PDF Standard encryption handler.

    The given password is checked against both the document's user
    password and its owner password; when either matches, the resulting
    decryption key is stored on the reader.  Which of the two matched
    makes no difference for later use: both yield the key that lets the
    document be used with this library.

    :param str password: The password to match.
    :return: ``0`` if the password failed, ``1`` if the password matched the user
        password, and ``2`` if the password matched the owner password.
    :rtype: int
    :raises NotImplementedError: if document uses an unsupported encryption
        method.
    """
    # The override flag stays raised only for the duration of the check
    # and is always lowered again, even when the check raises.
    self._override_encryption = True
    try:
        outcome = self._decrypt(password)
    finally:
        self._override_encryption = False
    return outcome
def _decrypt(self, password):
    """
    Checks ``password`` as user password, then as owner password.

    On a match the decryption key is stored in ``self._decryption_key``.

    :param password: the password to try.
    :return: ``1`` for a user-password match, ``2`` for an owner-password
        match, ``0`` when neither matched.
    :raises NotImplementedError: if the ``/Filter`` is not ``/Standard``
        or ``/V`` is neither 1 nor 2.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    if encrypt['/Filter'] != '/Standard':
        raise NotImplementedError("only Standard PDF encryption handler is available")
    if encrypt['/V'] not in (1, 2):
        raise NotImplementedError("only algorithm code 1 and 2 are supported. This PDF uses code %s" % encrypt['/V'])

    matched_user, key = self._authenticateUserPassword(password)
    if matched_user:
        self._decryption_key = key
        return 1

    # Not the user password: derive the user password from the /O entry
    # using a key computed from the candidate owner password, then
    # authenticate with the derived value.
    rev = encrypt['/R'].getObject()
    keylen = 5 if rev == 2 else encrypt['/Length'].getObject() // 8
    key = _alg33_1(password, rev, keylen)
    real_O = encrypt["/O"].getObject()
    if rev == 2:
        userpass = utils.RC4_encrypt(key, real_O)
    else:
        userpass = real_O
        # Twenty RC4 passes, counting down from 19 to 0; each pass uses
        # the key with every byte XOR-ed with the pass number.
        for i in range(19, -1, -1):
            new_key = b_('').join(
                [b_(chr(utils.ord_(key_byte) ^ i)) for key_byte in key])
            userpass = utils.RC4_encrypt(new_key, userpass)

    matched_owner, key = self._authenticateUserPassword(userpass)
    if matched_owner:
        self._decryption_key = key
        return 2
    return 0
- def _authenticateUserPassword(self, password):
- encrypt = self.trailer['/Encrypt'].getObject()
- rev = encrypt['/R'].getObject()
- owner_entry = encrypt['/O'].getObject()
- p_entry = encrypt['/P'].getObject()
- id_entry = self.trailer['/ID'].getObject()
- id1_entry = id_entry[0].getObject()
- real_U = encrypt['/U'].getObject().original_bytes
- if rev == 2:
- U, key = _alg34(password, owner_entry, p_entry, id1_entry)
- elif rev >= 3:
- U, key = _alg35(password, rev,
- encrypt["/Length"].getObject() // 8, owner_entry,
- p_entry, id1_entry,
- encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject())
- U, real_U = U[:16], real_U[:16]
- return U == real_U, key
def getIsEncrypted(self):
    """
    Tells whether the trailer carries an ``/Encrypt`` entry.

    :return: ``True`` if ``"/Encrypt"`` is a key of ``self.trailer``.
    :rtype: bool
    """
    trailer = self.trailer
    return "/Encrypt" in trailer

isEncrypted = property(fget=lambda self: self.getIsEncrypted())
"""
Read-only boolean property showing whether this PDF file is encrypted.
Note that this property, if true, will remain true even after the
:meth:`decrypt()<PdfFileReader.decrypt>` method is called.
"""
def getRectangle(self, name, defaults):
    """
    Returns the rectangle stored under ``name``, falling back to other
    keys when it is absent.

    If the stored value already is a :class:`RectangleObject` it is
    returned as is.  Otherwise the first non-``None`` value found under the
    ``defaults`` keys is used when ``name`` is missing, an indirect
    reference is resolved through ``self.pdf``, and the value is wrapped
    in a :class:`RectangleObject` which is stored back under ``name``
    before being returned.

    :param name: key of the rectangle entry.
    :param defaults: iterable of keys to try, in order, when ``name`` has
        no value.
    :return: the rectangle.
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    # Identity comparison: only a genuinely absent entry triggers the
    # fallback lookup.
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    setRectangle(self, name, retval)
    return retval
def setRectangle(self, name, value):
    """
    Stores ``value`` under ``name``, converting ``name`` to a
    :class:`NameObject` first when it is not one already.

    :param name: key of the rectangle entry.
    :param value: the value to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
def deleteRectangle(self, name):
    """
    Removes the entry stored under ``name``.

    :param name: key of the rectangle entry to delete.
    """
    key = name
    del self[key]
def createRectangleAccessor(name, fallback):
    """
    Builds a property that reads, writes and deletes the rectangle entry
    ``name`` through :func:`getRectangle`, :func:`setRectangle` and
    :func:`deleteRectangle`.

    :param name: key of the rectangle entry.
    :param fallback: keys handed to :func:`getRectangle` as ``defaults``.
    :return: the property object.
    """
    def _read(self):
        return getRectangle(self, name, fallback)

    def _write(self, value):
        return setRectangle(self, name, value)

    def _remove(self):
        return deleteRectangle(self, name)

    return property(_read, _write, _remove)
- class PageObject(DictionaryObject):
- """
- This class represents a single page within a PDF file. Typically this
- object will be created by accessing the
- :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the
- :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is
- also possible to create an empty page with the
- :meth:`createBlankPage()<PageObject.createBlankPage>` static method.
- :param pdf: PDF file the page belongs to.
- :param indirectRef: Stores the original indirect reference to
- this object in its source PDF
- """
def __init__(self, pdf=None, indirectRef=None):
    """
    Creates an empty page dictionary.

    :param pdf: PDF file the page belongs to.
    :param indirectRef: the original indirect reference to this object in
        its source PDF.
    """
    DictionaryObject.__init__(self)
    self.indirectRef = indirectRef
    self.pdf = pdf
@staticmethod
def createBlankPage(pdf=None, width=None, height=None):
    """
    Returns a new blank page.
    If ``width`` or ``height`` is ``None``, try to get the page size
    from the last page of *pdf*.

    :param pdf: PDF file the page belongs to
    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default user
        space units.
    :return: the new blank page:
    :rtype: :class:`PageObject<PageObject>`
    :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
        no page
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference  7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()

    size_missing = width is None or height is None
    if size_missing:
        if pdf is None or not pdf.getNumPages() > 0:
            raise utils.PageSizeNotDefinedError()
        # Both dimensions are taken from the last page of the document.
        lastpage = pdf.getPage(pdf.getNumPages() - 1)
        width = lastpage.mediaBox.getWidth()
        height = lastpage.mediaBox.getHeight()

    media_box = RectangleObject([0, 0, width, height])
    page[NameObject('/MediaBox')] = media_box
    return page
def rotateClockwise(self, angle):
    """
    Turns the page clockwise by a multiple of 90 degrees.

    :param int angle: Angle to rotate the page.  Must be an increment
        of 90 deg.
    :return: the page itself.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(angle)
    return self
def rotateCounterClockwise(self, angle):
    """
    Turns the page counter-clockwise by a multiple of 90 degrees.

    :param int angle: Angle to rotate the page.  Must be an increment
        of 90 deg.
    :return: the page itself.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(-angle)
    return self
def _rotate(self, angle):
    """
    Adds ``angle`` to the page's ``/Rotate`` entry (0 when absent).

    :param int angle: signed number of degrees to add.
    """
    rotateObj = self.get("/Rotate", 0)
    if isinstance(rotateObj, int):
        currentAngle = rotateObj
    else:
        # Anything that is not an int is resolved through getObject().
        currentAngle = rotateObj.getObject()
    self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)
@staticmethod
def _mergeResources(res1, res2, resource):
    """
    Merges the ``resource`` sub-dictionary of two resource dictionaries.

    Entries of ``res1`` are taken as they are.  An entry of ``res2`` whose
    name is unused is added unchanged; one whose name is already taken by
    a different raw value is added under a new name made of the old name
    followed by a random UUID.

    :param res1: resources of the first page.
    :param res2: resources of the second page.
    :param resource: name of the resource category (e.g. ``"/Font"``).
    :return: ``(merged, renames)`` where ``renames`` maps each old name of
        ``res2`` to the new name it received.
    """
    merged = DictionaryObject()
    merged.update(res1.get(resource, DictionaryObject()).getObject())
    incoming = res2.get(resource, DictionaryObject()).getObject()
    renames = {}
    for key in list(incoming.keys()):
        if key not in merged:
            merged[key] = incoming.raw_get(key)
        elif merged.raw_get(key) != incoming.raw_get(key):
            # Same name, different content: store under a unique name.
            unique_name = NameObject(key + str(uuid.uuid4()))
            renames[key] = unique_name
            merged[unique_name] = incoming[key]
    return merged, renames
- def _contentStreamRename(stream, rename, pdf):
- if not rename:
- return stream
- stream = ContentStream(stream, pdf)
- for operands, _operator in stream.operations:
- for i in range(len(operands)):
- op = operands[i]
- if isinstance(op, NameObject):
- operands[i] = rename.get(op,op)
- return stream
- _contentStreamRename = staticmethod(_contentStreamRename)
@staticmethod
def _pushPopGS(contents, pdf):
    """
    Wraps a content stream between a graphics state "push" (``q``) and
    "pop" (``Q``), which isolates it from changes such as transformation
    matrices.

    :param contents: the content to wrap.
    :param pdf: PDF file handed to :class:`ContentStream`.
    :return: the wrapped :class:`ContentStream`.
    """
    wrapped = ContentStream(contents, pdf)
    operations = wrapped.operations
    operations.insert(0, [[], "q"])
    operations.append([[], "Q"])
    return wrapped
@staticmethod
def _addTransformationMatrix(contents, pdf, ctm):
    """
    Puts a transformation matrix operation at the very beginning of a
    content stream.

    :param contents: the content to transform.
    :param pdf: PDF file handed to :class:`ContentStream`.
    :param ctm: the 6 operands of the transformation matrix.
    :return: the resulting :class:`ContentStream`.
    """
    a, b, c, d, e, f = ctm
    transformed = ContentStream(contents, pdf)
    matrix_operands = [FloatObject(value) for value in (a, b, c, d, e, f)]
    transformed.operations.insert(0, [matrix_operands, " cm"])
    return transformed
def getContents(self):
    """
    Gives access to the contents of the page.

    :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
        ``/Contents`` is optional, as described in PDF Reference  7.7.3.3
    """
    if "/Contents" not in self:
        return None
    return self["/Contents"].getObject()
def mergePage(self, page2):
    """
    Combines the content streams of two pages into a single one.

    Resource references (i.e. fonts) of both pages are kept.  The
    mediabox/cropbox/etc of this page stay as they are.  The content
    stream of ``page2`` goes after this page's content stream, so it is
    drawn after, or "on top" of, this page.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    """
    # No transformation, no matrix, no page expansion.
    self._mergePage(page2)
def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
    """
    Merges ``page2`` into this page: resources, annotations and content.

    :param page2: the page to merge into this one.
    :param page2transformation: optional callable applied to the content
        of ``page2`` before it is merged.
    :param ctm: optional 6-element transformation matrix, used only to
        compute the new media box when ``expand`` is true.
    :param bool expand: whether this page's media box is enlarged so that
        it also covers the (transformed) media box of ``page2``.
    """
    # First we work on merging the resource dictionaries.  This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    newResources = DictionaryObject()
    rename = {}
    originalResources = self["/Resources"].getObject()
    page2Resources = page2["/Resources"].getObject()

    # Annotations of both pages are concatenated, this page's first.
    newAnnots = ArrayObject()
    for page in (self, page2):
        if "/Annots" not in page:
            continue
        annots = page["/Annots"]
        if isinstance(annots, ArrayObject):
            newAnnots.extend(annots)

    resource_categories = ("/ExtGState", "/Font", "/XObject", "/ColorSpace",
                           "/Pattern", "/Shading", "/Properties")
    for res in resource_categories:
        new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
        if new:
            newResources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets.
    original_procset = frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject())
    page2_procset = frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
    newResources[NameObject("/ProcSet")] = ArrayObject(
        original_procset.union(page2_procset))

    # Each page's content is isolated in its own graphics state.
    newContentArray = ArrayObject()

    originalContent = self.getContents()
    if originalContent is not None:
        newContentArray.append(PageObject._pushPopGS(
            originalContent, self.pdf))

    page2Content = page2.getContents()
    if page2Content is not None:
        if page2transformation is not None:
            page2Content = page2transformation(page2Content)
        page2Content = PageObject._contentStreamRename(
            page2Content, rename, self.pdf)
        page2Content = PageObject._pushPopGS(page2Content, self.pdf)
        newContentArray.append(page2Content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
                    self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
        # The four corners of page2's media box as (x, y) points.
        corners2 = [
            (page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric()),
            (page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric()),
            (page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric()),
            (page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()),
        ]
        if ctm is not None:
            ctm = [float(x) for x in ctm]
            new_x = [ctm[0]*px + ctm[2]*py + ctm[4] for px, py in corners2]
            new_y = [ctm[1]*px + ctm[3]*py + ctm[5] for px, py in corners2]
        else:
            new_x = [px for px, py in corners2]
            new_y = [py for px, py in corners2]

        lowerleft = [min(corners1[0], min(new_x)), min(corners1[1], min(new_y))]
        upperright = [max(corners1[2], max(new_x)), max(corners1[3], max(new_y))]

        self.mediaBox.setLowerLeft(lowerleft)
        self.mediaBox.setUpperRight(upperright)

    self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
    self[NameObject('/Resources')] = newResources
    self[NameObject('/Annots')] = newAnnots
def mergeTransformedPage(self, page2, ctm, expand=False):
    """
    Works like mergePage, except that a transformation matrix is applied
    to the merged stream.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param tuple ctm: a 6-element tuple containing the operands of the
        transformation matrix
    :param bool expand: Whether the page should be expanded to fit the dimensions
        of the page to be merged.
    """
    def transform(page2Content):
        # Prefix the content of page2 with the matrix.
        return PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm)

    self._mergePage(page2, transform, ctm, expand)
def mergeScaledPage(self, page2, scale, expand=False):
    """
    Works like mergePage, except that the stream to be merged is scaled
    by applying a transformation matrix.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # CTM to scale : [ sx 0 0 sy 0 0 ]
    ctm = [scale, 0,
           0, scale,
           0, 0]
    return self.mergeTransformedPage(page2, ctm, expand)
def mergeRotatedPage(self, page2, rotation, expand=False):
    """
    Works like mergePage, except that the stream to be merged is rotated
    by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    radians = math.radians(rotation)
    cosine = math.cos(radians)
    sine = math.sin(radians)
    ctm = [cosine, sine,
           -sine, cosine,
           0, 0]
    return self.mergeTransformedPage(page2, ctm, expand)
def mergeTranslatedPage(self, page2, tx, ty, expand=False):
    """
    Works like mergePage, except that the stream to be merged is
    translated by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    ctm = [1, 0,
           0, 1,
           tx, ty]
    return self.mergeTransformedPage(page2, ctm, expand)
def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
    """
    Works like mergePage, except that the stream to be merged is rotated
    and translated by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    radians = math.radians(rotation)
    cosine = math.cos(radians)
    sine = math.sin(radians)

    # Matrices are multiplied in this order: translation by (-tx, -ty),
    # rotation, translation by (tx, ty).
    move_away = [[1, 0, 0],
                 [0, 1, 0],
                 [-tx, -ty, 1]]
    rotating = [[cosine, sine, 0],
                [-sine, cosine, 0],
                [0, 0, 1]]
    move_back = [[1, 0, 0],
                 [0, 1, 0],
                 [tx, ty, 1]]
    ctm = utils.matrixMultiply(move_away, rotating)
    ctm = utils.matrixMultiply(ctm, move_back)

    operands = [ctm[0][0], ctm[0][1],
                ctm[1][0], ctm[1][1],
                ctm[2][0], ctm[2][1]]
    return self.mergeTransformedPage(page2, operands, expand)
def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
    """
    Works like mergePage, except that the stream to be merged is rotated
    and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    radians = math.radians(rotation)
    cosine = math.cos(radians)
    sine = math.sin(radians)

    rotating = [[cosine, sine, 0],
                [-sine, cosine, 0],
                [0, 0, 1]]
    scaling = [[scale, 0, 0],
               [0, scale, 0],
               [0, 0, 1]]
    ctm = utils.matrixMultiply(rotating, scaling)

    operands = [ctm[0][0], ctm[0][1],
                ctm[1][0], ctm[1][1],
                ctm[2][0], ctm[2][1]]
    return self.mergeTransformedPage(page2, operands, expand)
def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
    """
    Works like mergePage, except that the stream to be merged is
    translated and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    scaling = [[scale, 0, 0],
               [0, scale, 0],
               [0, 0, 1]]
    translation = [[1, 0, 0],
                   [0, 1, 0],
                   [tx, ty, 1]]
    ctm = utils.matrixMultiply(scaling, translation)

    operands = [ctm[0][0], ctm[0][1],
                ctm[1][0], ctm[1][1],
                ctm[2][0], ctm[2][1]]
    return self.mergeTransformedPage(page2, operands, expand)
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
    """
    Works like mergePage, except that the stream to be merged is
    translated, rotated and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    radians = math.radians(rotation)
    cosine = math.cos(radians)
    sine = math.sin(radians)

    # Matrices are multiplied in this order: rotation, scaling,
    # translation.
    rotating = [[cosine, sine, 0],
                [-sine, cosine, 0],
                [0, 0, 1]]
    scaling = [[scale, 0, 0],
               [0, scale, 0],
               [0, 0, 1]]
    translation = [[1, 0, 0],
                   [0, 1, 0],
                   [tx, ty, 1]]
    ctm = utils.matrixMultiply(rotating, scaling)
    ctm = utils.matrixMultiply(ctm, translation)

    operands = [ctm[0][0], ctm[0][1],
                ctm[1][0], ctm[1][1],
                ctm[2][0], ctm[2][1]]
    return self.mergeTransformedPage(page2, operands, expand)
def addTransformation(self, ctm):
    """
    Applies a transformation matrix to the page.

    A page without contents is left untouched.

    :param tuple ctm: A 6-element tuple containing the operands of the
        transformation matrix.
    """
    originalContent = self.getContents()
    if originalContent is None:
        return
    transformed = PageObject._addTransformationMatrix(
        originalContent, self.pdf, ctm)
    # Keep the matrix from leaking out of this content.
    isolated = PageObject._pushPopGS(transformed, self.pdf)
    self[NameObject('/Contents')] = isolated
def scale(self, sx, sy):
    """
    Scales a page by the given factors by applying a transformation
    matrix to its content and updating the page size.

    When the page has a ``/VP`` entry, the ``/BBox`` of that viewport (of
    its first element when it is an array) is scaled as well.

    :param float sx: The scaling factor on horizontal axis.
    :param float sy: The scaling factor on vertical axis.
    """
    scaling_matrix = [sx, 0,
                      0, sy,
                      0, 0]
    self.addTransformation(scaling_matrix)
    self.mediaBox = RectangleObject([
        float(self.mediaBox.getLowerLeft_x()) * sx,
        float(self.mediaBox.getLowerLeft_y()) * sy,
        float(self.mediaBox.getUpperRight_x()) * sx,
        float(self.mediaBox.getUpperRight_y()) * sy])

    if "/VP" not in self:
        return

    viewport = self["/VP"]
    viewport_is_array = isinstance(viewport, ArrayObject)
    if viewport_is_array:
        bbox = viewport[0]["/BBox"]
    else:
        bbox = viewport["/BBox"]
    factors = (sx, sy, sx, sy)
    scaled_bbox = RectangleObject(
        [float(bbox[i]) * factors[i] for i in range(4)])
    if viewport_is_array:
        self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox
    else:
        self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox
def scaleBy(self, factor):
    """
    Scale the page uniformly: the same factor is passed to :meth:`scale`
    for both axes.

    :param float factor: The scaling factor (for both X and Y axis).
    """
    sx = sy = factor
    self.scale(sx, sy)
def scaleTo(self, width, height):
    """
    Scale the page so that its media box gets the requested dimensions.

    The factors are derived from the current media box width and height
    and then passed to :meth:`scale`.

    :param float width: The new width.
    :param float height: The new height.
    """
    box = self.mediaBox

    current_width = box.getUpperRight_x() - box.getLowerLeft_x()
    sx = width / float(current_width)

    current_height = box.getUpperRight_y() - box.getLowerLeft_y()
    sy = height / float(current_height)

    self.scale(sx, sy)
def compressContentStreams(self):
    """
    Replace the page's ``/Contents`` with a flate-encoded
    :class:`ContentStream`.

    Content that is not already a ContentStream is first wrapped in one.
    A page without content is left untouched.
    """
    current = self.getContents()
    if current is None:
        return

    if isinstance(current, ContentStream):
        stream = current
    else:
        stream = ContentStream(current, self.pdf)
    self[NameObject("/Contents")] = stream.flateEncode()
def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text. This works well for some PDF
    files, but poorly for others, depending on the generator used. Do not
    rely on the order of text coming out of this function, as it will
    change if this function is made more sophisticated.

    A page that has no content yields an empty string instead of raising
    ``KeyError``.

    :return: a unicode string object.
    """
    # getContents() is what the sibling methods use; it reports a missing
    # /Contents entry as None rather than raising.
    content = self.getContents()
    if content is None:
        return u_("")
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # Operator tokens are loop-invariant, so build them once.
    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_quote = b_("'")
    op_double_quote = b_('"')
    op_show_array = b_("TJ")

    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    pieces = []
    for operands, operator in content.operations:
        if operator == op_show:
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
                pieces.append("\n")
        elif operator == op_next_line:
            pieces.append("\n")
        elif operator == op_quote:
            # The line break is emitted even when the string is skipped.
            pieces.append("\n")
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
        elif operator == op_double_quote:
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                pieces.append("\n")
                pieces.append(_text)
        elif operator == op_show_array:
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(" ")
                    pieces.append(item)
            pieces.append("\n")
    return u_("").join(pieces)
mediaBox = createRectangleAccessor("/MediaBox", ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed.
"""

cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user space.
When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediaBox<mediaBox>`.
"""

bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment.
"""

trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming.
"""

artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator.
"""
class ContentStream(DecodedStreamObject):
    """
    A content stream parsed into a list of operations.

    ``operations`` holds ``(operands, operator)`` tuples in the order they
    were read. An inline image is stored as a single entry whose operator
    is ``INLINE IMAGE`` and whose operands slot is a dictionary with the
    keys ``"settings"`` and ``"data"``.
    """

    def __init__(self, stream, pdf):
        """
        :param stream: a stream object, or an array of stream objects whose
            data is joined before parsing; ``getObject()`` is called on it
            (and on every array element) first.
        :param pdf: stored as ``self.pdf`` and passed to ``readObject``
            while reading inline image settings.
        """
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            # The parts of a /Contents array are split at token boundaries.
            # Joining them with whitespace keeps the last token of one part
            # from fusing with the first token of the next.
            data = b_("\n").join(
                [b_(part.getObject().getData()) for part in stream])
        else:
            data = b_(stream.getData())
        self.__parseContentStream(BytesIO(data))

    def __parseContentStream(self, stream):
        """
        Read operands and operators from ``stream`` (from its start) and
        append them to ``self.operations``.
        """
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == b_('') or ord_(peek) == 0:
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
                operator = utils.readUntilRegex(stream,
                        NameObject.delimiterPattern, True)
                if operator == b_("BI"):
                    # begin inline image - a completely different parsing
                    # mechanism is required
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, b_("INLINE IMAGE")))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == b_('%'):
                # If we encounter a comment in the content stream, we have to
                # handle it here. Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read. In this case, it could be an operator instead.
                # An empty read means end of stream; without that check a
                # trailing comment with no newline would loop forever.
                while peek not in (b_('\r'), b_('\n'), b_('')):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        """
        Read an inline image, starting just after its ``BI`` operator.

        :return: a dictionary with the image ``"settings"`` (a
            DictionaryObject) and the raw image ``"data"`` (bytes).
        :raises utils.PdfReadError: if the stream ends before the image
            data is terminated.
        """
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_(""):
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image settings")
            stream.seek(-1, 1)
            if tok == b_("I"):
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID
        tmp = stream.read(3)
        assert tmp[:2] == b_("ID")
        # Accumulate in a buffer instead of growing a bytes object one
        # byte at a time.
        data = BytesIO()
        while True:
            # Read the inline image, while checking for EI (End Image) operator.
            tok = stream.read(1)
            if tok == b_(""):
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            if tok != b_("E"):
                data.write(tok)
                continue
            # Check for End Image
            tok2 = stream.read(1)
            if tok2 != b_("I"):
                # Only step back when a byte was actually consumed;
                # at end of stream stepping back would re-read the "E"
                # forever.
                if tok2:
                    stream.seek(-1, 1)
                data.write(tok)
                continue
            # Data can contain EI, so check for the Q operator.
            tok3 = stream.read(1)
            info = tok + tok2
            # We need to find whitespace between EI and Q.
            has_q_whitespace = False
            while tok3 in utils.WHITESPACES:
                has_q_whitespace = True
                info += tok3
                tok3 = stream.read(1)
            if tok3 == b_("Q") and has_q_whitespace:
                # Leave the Q for the regular operator parser.
                stream.seek(-1, 1)
                break
            if tok3:
                stream.seek(-1, 1)
            data.write(info)
        return {"settings": settings, "data": data.getvalue()}

    def _getData(self):
        """Serialize ``self.operations`` back into content stream bytes."""
        newdata = BytesIO()
        for operands, operator in self.operations:
            if operator == b_("INLINE IMAGE"):
                newdata.write(b_("BI"))
                dicttext = BytesIO()
                operands["settings"].writeToStream(dicttext, None)
                # Drop the first and last two bytes of the serialized
                # dictionary (presumably its "<<" and ">>" delimiters).
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write(b_("ID "))
                newdata.write(operands["data"])
                newdata.write(b_("EI"))
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(b_(" "))
                newdata.write(b_(operator))
            newdata.write(b_("\n"))
        return newdata.getvalue()

    def _setData(self, value):
        """Parse ``value`` and append its operations to ``self.operations``."""
        self.__parseContentStream(BytesIO(b_(value)))

    _data = property(_getData, _setData)
class DocumentInformation(DictionaryObject):
    """
    A class representing the basic document metadata provided in a PDF File.
    This class is accessible through
    :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`

    Every text entry is exposed through *two* properties, e.g. ``author``
    and ``author_raw``. The plain property only ever yields a
    ``TextStringObject`` (or ``None``), which makes it the right choice for
    display. The raw property returns whatever is stored, which can be a
    ``ByteStringObject`` when PyPDF2 was unable to decode the string's text
    encoding; callers of the raw properties need to cope with that.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def getText(self, key):
        """
        Return the entry stored under ``key`` when it is a
        ``TextStringObject``; otherwise (missing or another type) ``None``.
        """
        value = self.get(key, None)
        return value if isinstance(value, TextStringObject) else None

    title = property(lambda self: self.getText("/Title"))
    """Read-only property accessing the document's **title**.
    Returns a unicode string (``TextStringObject``) or ``None``
    if the title is not specified."""

    title_raw = property(lambda self: self.get("/Title"))
    """The "raw" version of title; can return a ``ByteStringObject``."""

    author = property(lambda self: self.getText("/Author"))
    """Read-only property accessing the document's **author**.
    Returns a unicode string (``TextStringObject``) or ``None``
    if the author is not specified."""

    author_raw = property(lambda self: self.get("/Author"))
    """The "raw" version of author; can return a ``ByteStringObject``."""

    subject = property(lambda self: self.getText("/Subject"))
    """Read-only property accessing the document's **subject**.
    Returns a unicode string (``TextStringObject``) or ``None``
    if the subject is not specified."""

    subject_raw = property(lambda self: self.get("/Subject"))
    """The "raw" version of subject; can return a ``ByteStringObject``."""

    creator = property(lambda self: self.getText("/Creator"))
    """Read-only property accessing the document's **creator**. If the
    document was converted to PDF from another format, this is the name of
    the application (e.g. OpenOffice) that created the original document
    from which it was converted. Returns a unicode string
    (``TextStringObject``) or ``None`` if the creator is not specified."""

    creator_raw = property(lambda self: self.get("/Creator"))
    """The "raw" version of creator; can return a ``ByteStringObject``."""

    producer = property(lambda self: self.getText("/Producer"))
    """Read-only property accessing the document's **producer**.
    If the document was converted to PDF from another format, this is
    the name of the application (for example, OSX Quartz) that converted
    it to PDF. Returns a unicode string (``TextStringObject``)
    or ``None`` if the producer is not specified."""

    producer_raw = property(lambda self: self.get("/Producer"))
    """The "raw" version of producer; can return a ``ByteStringObject``."""
def convertToInt(d, size):
    """
    Interpret the bytes ``d`` as a big-endian integer.

    ``d`` is left-padded with zero bytes, and the last 8 bytes of the result
    are unpacked as a signed 64-bit big-endian value.

    :param d: the bytes to convert.
    :param int size: the declared field width; only checked against the
        8-byte maximum.
    :raises utils.PdfReadError: if ``size`` is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
    (value,) = struct.unpack(">q", padded[-8:])
    return value
# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
# The 32-byte padding string appended to passwords before they are truncated
# to 32 bytes.
_encryption_padding = (
    b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') +
    b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') +
    b_('\xa9\xfe\x64\x53\x69\x7a')
)
# Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference: compute the encryption key.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    import struct

    # Step 1: pad the password with the padding string and keep exactly
    # the first 32 bytes.
    padded = b_((str_(password) + str_(_encryption_padding))[:32])

    # Steps 2-5: hash the padded password, the /O entry, the /P entry
    # (packed as a 4-byte little-endian signed integer) and the first
    # element of the file identifier array, in that order.
    hasher = md5(padded)
    hasher.update(owner_entry.original_bytes)
    hasher.update(struct.pack('<i', p_entry))
    hasher.update(id1_entry.original_bytes)

    # Step 6 (revision 3 or greater): when document metadata is not being
    # encrypted, four 0xFF bytes are added to the hash input.
    if rev >= 3 and not metadata_encrypt:
        hasher.update(b_("\xff\xff\xff\xff"))

    # Step 7: finish the hash.
    digest = hasher.digest()

    # Step 8 (revision 3 or greater): re-hash the first keylen bytes of the
    # previous digest, 50 times.
    if rev >= 3:
        for _ in range(50):
            digest = md5(digest[:keylen]).digest()

    # Step 9: the key is the first keylen bytes of the final digest.
    return digest[:keylen]
# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference: compute the /O entry.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    # steps 1 - 4
    key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string as described in step 1 of
    # algorithm 3.2. The password goes through str_ first, exactly as
    # _alg32 does, so that it can be concatenated with the padding.
    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
    # 6. Encrypt the result of step 5, using an RC4 encryption function with
    # the encryption key obtained in step 4.
    val = utils.RC4_encrypt(key, user_pwd)
    # 7. (Revision 3 or greater) Do the following 19 times: Take the output
    # from the previous invocation of the RC4 function and pass it as input to
    # a new invocation of the function; use an encryption key generated by
    # taking each byte of the encryption key obtained in step 4 and performing
    # an XOR operation between that byte and the single-byte value of the
    # iteration counter (from 1 to 19).
    if rev >= 3:
        for i in range(1, 20):
            # Build the per-round key as bytes, the same way _alg35 does.
            new_key = b_('')
            for k in key:
                new_key += b_(chr(ord_(k) ^ i))
            val = utils.RC4_encrypt(new_key, val)
    # 8. Store the output from the final invocation of the RC4 as the value of
    # the /O entry in the encryption dictionary.
    return val
# Steps 1-4 of algorithm 3.3: derive the RC4 key from the owner password.
def _alg33_1(password, rev, keylen):
    # 1. Pad or truncate the owner password string as described in step 1 of
    # algorithm 3.2. If there is no owner password, use the user password
    # instead. The password goes through str_ first, exactly as _alg32
    # does, so that it can be concatenated with the padding.
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    md5_hash = md5(password).digest()
    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash).digest()
    # 4. Create an RC4 encryption key using the first n bytes of the output
    # from the final MD5 hash, where n is always 5 for revision 2 but, for
    # revision 3 or greater, depends on the value of the encryption
    # dictionary's /Length entry.
    return md5_hash[:keylen]
# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference: compute the /U entry for
# revision 2.
def _alg34(password, owner_entry, p_entry, id1_entry):
    # Step 1: derive the encryption key from the user password with
    # algorithm 3.2, using revision 2 and a 5-byte key.
    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # Steps 2-3: the /U value is the 32-byte padding string RC4-encrypted
    # with that key. The key is returned alongside it.
    return utils.RC4_encrypt(key, _encryption_padding), key
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference: compute the /U entry for
# revision 3 or greater.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    # Step 1: derive the encryption key from the user password with
    # algorithm 3.2.
    # NOTE(review): metadata_encrypt is accepted here but not forwarded, so
    # _alg32 always runs with its default of True. Confirm what callers pass
    # before changing this.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)

    # Steps 2-3: MD5 over the 32-byte padding string followed by the first
    # element of the file identifier array.
    hasher = md5(_encryption_padding)
    hasher.update(id1_entry.original_bytes)

    # Step 4: RC4-encrypt the 16-byte digest with the key from step 1.
    val = utils.RC4_encrypt(key, hasher.digest())

    # Step 5: 19 more RC4 rounds, each keyed with the step-1 key whose
    # bytes are XORed with the round counter (1 to 19).
    for i in range(1, 20):
        round_key = b_('').join([b_(chr(ord_(k) ^ i)) for k in key])
        val = utils.RC4_encrypt(round_key, val)

    # Step 6: append 16 bytes of arbitrary padding to form the 32-byte /U
    # value. Null bytes are used for the padding, which seems to match
    # other implementations. The key is returned alongside it.
    return val + (b_('\x00') * 16), key
|