12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227 |
- # vim: sw=4:expandtab:foldmethod=marker
- #
- # Copyright (c) 2006, Mathieu Fenniak
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # * Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- # * The name of the author may not be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- # POSSIBILITY OF SUCH DAMAGE.
- """
- Implementation of generic PDF objects (dictionary, number, string, and so on)
- """
- __author__ = "Mathieu Fenniak"
- __author_email__ = "biziqe@mathieu.fenniak.net"
- import re
- from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
- from .utils import b_, u_, chr_, ord_
- from .utils import PdfStreamError
- import warnings
- from . import filters
- from . import utils
- import decimal
- import codecs
# Bytes that can start an object; readObject() dispatches on the index of the
# peeked byte within this sequence.
ObjectPrefix = b_("/<[tf(n%")
# Sign characters.
NumberSigns = b_("+-")
# Used by readObject() to tell an indirect reference ("<id> <gen> R" followed
# by a non-letter) apart from a plain number.
IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
def readObject(stream, pdf):
    """Read the object that starts at the current position of ``stream``.

    The first byte is peeked (and un-read again); its index inside
    ``ObjectPrefix`` selects the reader:

    * ``/``  -> NameObject
    * ``<``  -> DictionaryObject when followed by a second ``<``, otherwise
      a hexadecimal string
    * ``[``  -> ArrayObject
    * ``t`` / ``f`` -> BooleanObject
    * ``(``  -> literal string
    * ``n``  -> NullObject
    * ``%``  -> comment: skipped, then the following object is read
    * anything else -> IndirectObject or NumberObject

    :param stream: seekable binary stream positioned at the object.
    :param pdf: reader object handed through to the readers that need it.
    :raises PdfStreamError: if the stream ends inside a comment.
    """
    first = stream.read(1)
    stream.seek(-1, 1)  # put the peeked byte back
    kind = ObjectPrefix.find(first)

    if kind == 0:
        return NameObject.readFromStream(stream, pdf)

    if kind == 1:
        # "<<" opens a dictionary, a single "<" opens a hex string.
        opener = stream.read(2)
        stream.seek(-2, 1)
        if opener == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)

    if kind == 2:
        return ArrayObject.readFromStream(stream, pdf)

    if kind in (3, 4):
        return BooleanObject.readFromStream(stream)

    if kind == 5:
        return readStringFromStream(stream)

    if kind == 6:
        return NullObject.readFromStream(stream)

    if kind == 7:
        # Comment: consume up to and including the end-of-line byte.
        cur = first
        while cur not in (b_('\r'), b_('\n')):
            cur = stream.read(1)
            # Raising here prevents an endless loop once EOF is reached.
            if not cur:
                raise PdfStreamError("File ended unexpectedly.")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)

    # Number or indirect reference: look ahead to decide which one.
    lookahead = stream.read(20)
    stream.seek(-len(lookahead), 1)
    if IndirectPattern.match(lookahead) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
class PdfObject(object):
    """Common base class of the PDF object types defined in this module."""

    def getObject(self):
        """Resolve indirect references; a plain object resolves to itself."""
        return self
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        """Write the keyword ``null``; ``encryption_key`` is not used."""
        stream.write(b_("null"))

    @staticmethod
    def readFromStream(stream):
        """Consume four bytes and return a NullObject.

        :raises utils.PdfReadError: if the four bytes are not ``null``.
        """
        keyword = stream.read(4)
        if keyword == b_("null"):
            return NullObject()
        raise utils.PdfReadError("Could not read Null object")
class BooleanObject(PdfObject):
    """A PDF boolean (``true`` / ``false``) wrapping ``self.value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        """Write ``true`` or ``false`` depending on the truthiness of value."""
        stream.write(b_("true") if self.value else b_("false"))

    @staticmethod
    def readFromStream(stream):
        """Parse ``true`` or ``false`` from ``stream``.

        :raises utils.PdfReadError: if the first four bytes are neither
            ``true`` nor ``fals``.
        """
        head = stream.read(4)
        if head == b_("true"):
            return BooleanObject(True)
        if head != b_("fals"):
            raise utils.PdfReadError('Could not read Boolean object')
        # Consume the fifth byte of "false"; its value is not inspected.
        stream.read(1)
        return BooleanObject(False)
class ArrayObject(list, PdfObject):
    """A PDF array, stored as a Python list of PDF objects."""

    def writeToStream(self, stream, encryption_key):
        """Write ``[ item item ... ]``, delegating to each item's writer."""
        stream.write(b_("["))
        for item in self:
            stream.write(b_(" "))
            item.writeToStream(stream, encryption_key)
        stream.write(b_(" ]"))

    @staticmethod
    def readFromStream(stream, pdf):
        """Parse an array starting at ``[`` and return it as an ArrayObject.

        :raises utils.PdfReadError: if the first byte is not ``[``.
        """
        items = ArrayObject()
        if stream.read(1) != b_("["):
            raise utils.PdfReadError("Could not read array")
        while True:
            # Skip whitespace in front of the next element.
            ch = stream.read(1)
            while ch.isspace():
                ch = stream.read(1)
            stream.seek(-1, 1)
            # A closing bracket ends the array ...
            if stream.read(1) == b_("]"):
                break
            # ... anything else is un-read and parsed as an element.
            stream.seek(-1, 1)
            items.append(readObject(stream, pdf))
        return items
class IndirectObject(PdfObject):
    """An indirect reference (``<idnum> <generation> R``) into ``pdf``."""

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Ask ``self.pdf`` for the referenced object and resolve it fully."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        """Equal when id and generation match and both share one ``pdf``."""
        return (
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
        )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes the class unhashable on Python 3.
        # Hash the same fields __eq__ compares (the pdf by identity).
        return hash((self.idnum, self.generation, id(self.pdf)))

    def writeToStream(self, stream, encryption_key):
        """Write the reference as ``<idnum> <generation> R``."""
        stream.write(b_("%s %s R" % (self.idnum, self.generation)))

    @staticmethod
    def readFromStream(stream, pdf):
        """Parse ``<idnum> <generation> R`` and return an IndirectObject.

        :raises PdfStreamError: if the stream ends inside either number.
        :raises utils.PdfReadError: if the numbers are not followed by ``R``.
        """
        # Object number: everything up to the first whitespace byte.
        idnum = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                break
            idnum += tok

        # Generation number: leading whitespace is skipped, the first
        # whitespace after at least one byte terminates it.
        generation = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok

        marker = readNonWhitespace(stream)
        if marker != b_("R"):
            raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
        return IndirectObject(int(idnum), int(generation), pdf)
class FloatObject(decimal.Decimal, PdfObject):
    """A PDF real number backed by ``decimal.Decimal``."""

    def __new__(cls, value="0", context=None):
        # First try the value converted by utils.str_; on any failure fall
        # back to the builtin str() conversion (without a context).
        try:
            return decimal.Decimal.__new__(cls, utils.str_(value), context)
        except Exception:
            return decimal.Decimal.__new__(cls, str(value))

    def __repr__(self):
        """Integral values print without a fraction, others with at most
        five decimals and no trailing zeros."""
        if self == self.to_integral():
            return str(self.quantize(decimal.Decimal(1)))
        # "%.5f" pads with zeros that carry no information; drop them.
        return ("%.5f" % self).rstrip('0')

    def as_numeric(self):
        """Return the value as a ``float`` built from its repr."""
        return float(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr of the number; ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))
class NumberObject(int, PdfObject):
    """A PDF integer."""

    # Matches the first byte that cannot belong to a number.  The hyphen is
    # escaped: written bare between "+" and "." it forms a character range
    # that also covers ",", so commas would be swallowed into number tokens.
    NumberPattern = re.compile(b_(r'[^+\-.0-9]'))
    ByteDot = b_(".")

    def __new__(cls, value):
        val = int(value)
        try:
            return int.__new__(cls, val)
        except OverflowError:
            return int.__new__(cls, 0)

    def as_numeric(self):
        """Return the value as a plain ``int`` built from its repr."""
        return int(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr of the number; ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))

    @staticmethod
    def readFromStream(stream):
        """Read a numeric token from ``stream``.

        The token is whatever ``utils.readUntilRegex`` returns for
        ``NumberPattern``.  A token containing ``.`` becomes a FloatObject,
        any other token a NumberObject.
        """
        num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
        if num.find(NumberObject.ByteDot) != -1:
            return FloatObject(num)
        return NumberObject(num)
def createStringObject(string):
    """Wrap ``string`` in a TextStringObject or a ByteStringObject.

    * text (``utils.string_type``) becomes a TextStringObject;
    * bytes starting with the UTF-16BE BOM are decoded as UTF-16 and the
      result is flagged ``autodetect_utf16``;
    * other bytes are decoded with ``decode_pdfdocencoding`` and flagged
      ``autodetect_pdfdocencoding``;
    * bytes that raise ``UnicodeDecodeError`` become a ByteStringObject.

    :raises TypeError: if ``string`` is neither text nor bytes.
    """
    if isinstance(string, utils.string_type):
        return TextStringObject(string)
    if not isinstance(string, utils.bytes_type):
        raise TypeError("createStringObject should have str or unicode arg")

    try:
        if string.startswith(codecs.BOM_UTF16_BE):
            text = TextStringObject(string.decode("utf-16"))
            text.autodetect_utf16 = True
        else:
            # Trying the decode is the only way to find out whether these
            # bytes are text at all; some strings are just byte arrays.
            text = TextStringObject(decode_pdfdocencoding(string))
            text.autodetect_pdfdocencoding = True
        return text
    except UnicodeDecodeError:
        return ByteStringObject(string)
def readHexStringFromStream(stream):
    """Parse a hexadecimal string (``<...>``) into a string object.

    Hex digits are read with ``readNonWhitespace`` and combined in pairs.
    A single leftover digit is completed with ``0``.

    :raises PdfStreamError: if the stream ends before ``>``.
    """
    stream.read(1)  # consume the opening "<"
    decoded = []
    pair = b_("")
    while True:
        digit = readNonWhitespace(stream)
        if not digit:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if digit == b_(">"):
            break
        pair += digit
        if len(pair) == 2:
            decoded.append(chr(int(pair, base=16)))
            pair = b_("")
    # At this point at most one digit is pending; pad it to a full byte.
    if len(pair) == 1:
        pair += b_("0")
        decoded.append(chr(int(pair, base=16)))
    return createStringObject(b_("".join(decoded)))
# Single-byte escapes accepted after a backslash inside a literal string,
# mapped to the bytes they produce.  Built once at import time instead of
# once per escape sequence.
_STRING_ESCAPES = {
    b_("n"): b_("\n"),
    b_("r"): b_("\r"),
    b_("t"): b_("\t"),
    b_("b"): b_("\b"),
    b_("f"): b_("\f"),
    b_("c"): b_(r"\c"),
    b_("("): b_("("),
    b_(")"): b_(")"),
    b_("/"): b_("/"),
    b_("\\"): b_("\\"),
    b_(" "): b_(" "),
    b_("%"): b_("%"),
    b_("<"): b_("<"),
    b_(">"): b_(">"),
    b_("["): b_("["),
    b_("]"): b_("]"),
    b_("#"): b_("#"),
    b_("_"): b_("_"),
    b_("&"): b_("&"),
    b_('$'): b_('$'),
}


def readStringFromStream(stream):
    """Parse a literal string (``(...)``) into a string object.

    Balanced, unescaped parentheses are kept as part of the string.  After
    a backslash the following forms are handled: the single-byte escapes in
    ``_STRING_ESCAPES``, one to three digit octal codes, and an escaped line
    break (which contributes nothing to the result).

    :raises PdfStreamError: if the stream ends before the closing ``)``.
    :raises utils.PdfReadError: on an unrecognised escape sequence.
    """
    stream.read(1)  # consume the opening "("
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            try:
                tok = _STRING_ESCAPES[tok]
            except KeyError:
                if tok.isdigit():
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    for _ in range(2):
                        ntok = stream.read(1)
                        if ntok.isdigit():
                            tok += ntok
                        else:
                            # The byte after a short escape belongs to the
                            # string (it may even be the closing ")"), so
                            # it must be un-read rather than dropped.
                            if ntok:
                                stream.seek(-1, 1)
                            break
                    # Mask to one byte: high-order overflow is ignored.
                    tok = b_(chr(int(tok, base=8) & 0xFF))
                elif tok in b_("\n\r"):
                    # This case is hit when a backslash followed by a line
                    # break occurs.  If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b_("\n\r"):
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since
                    # this line break was escaped:
                    tok = b_('')
                else:
                    raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)
class ByteStringObject(utils.bytes_type, PdfObject):
    """A string object whose text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate
    way to represent strings -- for example, the encryption data stored in
    files (like /O) is clearly not text, but is still stored in a "String"
    object.
    """

    # For compatibility with TextStringObject.original_bytes; evaluates to
    # the object itself.
    original_bytes = property(lambda self: self)

    def writeToStream(self, stream, encryption_key):
        """Write the bytes as a hex string, RC4-encrypting them first when
        ``encryption_key`` is truthy."""
        payload = RC4_encrypt(encryption_key, self) if encryption_key else self
        stream.write(b_("<"))
        stream.write(utils.hexencode(payload))
        stream.write(b_(">"))
class TextStringObject(utils.string_type, PdfObject):
    """A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16
    decoding to occur.
    """

    # Set by createStringObject() to record how the text was decoded.
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False

    # It is occasionally possible that a text string object gets created
    # where a byte string object was expected due to the autodetection
    # mechanism -- if that occurs, this property can be used to
    # back-calculate what the original encoded bytes were.
    original_bytes = property(lambda self: self.get_original_bytes())

    def get_original_bytes(self):
        """Re-encode the text according to the recorded autodetect flag.

        :raises Exception: if neither autodetect flag is set.
        """
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")

    def writeToStream(self, stream, encryption_key):
        """Write the string, preferring PDFDocEncoding over UTF-16BE.

        With a truthy ``encryption_key`` the encoded bytes are RC4-encrypted
        and emitted through ByteStringObject; otherwise a literal string is
        written.
        """
        # PDFDocEncoding is nicer to look at in the file; UTF-16BE with a
        # BOM is the fallback when the text cannot be encoded that way.
        try:
            raw = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            raw = codecs.BOM_UTF16_BE + self.encode("utf-16be")

        if encryption_key:
            encrypted = ByteStringObject(RC4_encrypt(encryption_key, raw))
            encrypted.writeToStream(stream, None)
            return

        stream.write(b_("("))
        for c in raw:
            # NOTE(review): where iterating ``raw`` yields ints (Python 3
            # bytes), ``c == b_(' ')`` presumably never holds, so spaces
            # take the octal-escape branch -- confirm whether intended.
            if chr_(c).isalnum() or c == b_(' '):
                stream.write(b_(chr_(c)))
            else:
                stream.write(b_("\\%03o" % ord_(c)))
        stream.write(b_(")"))
class NameObject(str, PdfObject):
    """A PDF name such as ``/Type``; the leading slash is part of the value."""

    # Whitespace or a delimiter byte terminates a name.
    delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
    surfix = b_("/")

    def writeToStream(self, stream, encryption_key):
        """Write the name as-is; ``encryption_key`` is not used."""
        stream.write(b_(self))

    @staticmethod
    def readFromStream(stream, pdf):
        """Parse a name starting at ``/`` and return it as a NameObject.

        :raises utils.PdfReadError: if the first byte is not ``/``, or if
            the name is not valid UTF-8 and ``pdf.strict`` is true.
        """
        raw = stream.read(1)
        if raw != NameObject.surfix:
            raise utils.PdfReadError("name read error")
        raw += utils.readUntilRegex(stream, NameObject.delimiterPattern,
                                    ignore_eof=True)
        try:
            return NameObject(raw.decode('utf-8'))
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            if pdf.strict:
                raise utils.PdfReadError("Illegal character in Name Object")
            warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
            return NameObject(raw)
class DictionaryObject(dict, PdfObject):
    """A PDF dictionary whose keys and values are PDF objects.

    Item access through ``[]`` resolves the stored value with
    ``getObject()``; ``raw_get`` returns the stored value untouched.
    """

    def raw_get(self, key):
        """Return the stored value for ``key`` without resolving it."""
        return dict.__getitem__(self, key)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; both must be PdfObject instances.

        :raises ValueError: if either is not a PdfObject.
        """
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value=None):
        """``dict.setdefault`` with the same type checks as ``__setitem__``.

        :raises ValueError: if key or value is not a PdfObject (this
            includes the default ``value=None``).
        """
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.setdefault(self, key, value)

    def __getitem__(self, key):
        return dict.__getitem__(self, key).getObject()

    def getXmpMetadata(self):
        """Retrieve XMP (Extensible Metadata Platform) data relevant to
        this object, if available.

        Stability: Added in v1.12, will exist for all future v1.x releases.

        :return: an ``xmp.XmpInformation`` instance, or ``None`` when the
            dictionary has no ``/Metadata`` entry.  A value that is not yet
            an XmpInformation is wrapped and stored back under
            ``/Metadata``.
        """
        found = self.get("/Metadata", None)
        if found is None:
            return None
        found = found.getObject()
        from . import xmp
        if isinstance(found, xmp.XmpInformation):
            return found
        wrapped = xmp.XmpInformation(found)
        self[NameObject("/Metadata")] = wrapped
        return wrapped

    # Read-only property that accesses the getXmpMetadata function.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)

    def writeToStream(self, stream, encryption_key):
        """Write the dictionary as ``<<``, one ``key value`` line per entry,
        then ``>>``."""
        stream.write(b_("<<\n"))
        for name, entry in list(self.items()):
            name.writeToStream(stream, encryption_key)
            stream.write(b_(" "))
            entry.writeToStream(stream, encryption_key)
            stream.write(b_("\n"))
        stream.write(b_(">>"))

    @staticmethod
    def readFromStream(stream, pdf):
        """Parse a dictionary, and the stream body following it if any.

        :return: the result of ``StreamObject.initializeFromDictionary``
            when the dictionary is followed by the ``stream`` keyword,
            otherwise a DictionaryObject.
        :raises utils.PdfReadError: if the input does not start with ``<<``,
            if a key is defined twice while ``pdf.strict`` is true, or if no
            ``endstream`` marker is found after the stream data.
        :raises PdfStreamError: if the input ends inside the dictionary.
        """
        if stream.read(2) != b_("<<"):
            raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))

        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                continue
            if tok == b_('%'):
                stream.seek(-1, 1)
                skipOverComment(stream)
                continue
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok == b_(">"):
                stream.read(1)  # second byte of the closing ">>"
                break

            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)

            # A key counts as already defined only while its stored value
            # is truthy.
            if not data.get(key):
                data[key] = value
            elif pdf.strict:
                # multiple definitions of key not permitted
                raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
                                         % (utils.hexStr(stream.tell()), key))
            else:
                warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
                              % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)

        after_dict = stream.tell()
        lead = readNonWhitespace(stream)
        if lead == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but
            # before EOL.  patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            assert eol in (b_("\n"), b_("\r"))
            if eol == b_("\r"):
                # swallow the "\n" of a "\r\n" pair, un-read anything else
                if stream.read(1) != b_('\n'):
                    stream.seek(-1, 1)

            # this is a stream object, not a dictionary
            assert "/Length" in data
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # Resolving the length goes through ``pdf``; the position
                # is saved before and restored after that call.
                resume_at = stream.tell()
                length = pdf.getObject(length)
                stream.seek(resume_at, 0)
            data["__streamdata__"] = stream.read(length)

            first = readNonWhitespace(stream)
            rest = stream.read(8)
            if (first + rest) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long,
                # so we need to read backwards to find the "endstream"
                # ending.  ReportLab (unknown version) generates files with
                # this bug, and Python users into PDF files tend to be our
                # audience.  we need to do this to correct the streamdata
                # and chop off an extra character.
                after_marker = stream.tell()
                stream.seek(-10, 1)
                if stream.read(9) == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(after_marker, 0)
                    raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
        else:
            # Not a stream: rewind to just behind the dictionary.
            stream.seek(after_dict, 0)

        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        result = DictionaryObject()
        result.update(data)
        return result
class TreeObject(DictionaryObject):
    """A dictionary that heads a linked list of child dictionaries.

    The parent holds ``/First``, ``/Last`` and ``/Count``; every child holds
    ``/Parent`` and, where applicable, ``/Next`` and ``/Prev``.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def hasChildren(self):
        """Return True when a ``/First`` entry is present."""
        return '/First' in self

    def __iter__(self):
        return self.children()

    def children(self):
        """Yield the children from ``/First`` onwards, following ``/Next``
        until the child equal to ``/Last`` has been yielded."""
        # A generator must finish with ``return``: raising StopIteration
        # inside one is turned into RuntimeError by PEP 479.
        if not self.hasChildren():
            return
        child = self['/First']
        while True:
            yield child
            if child == self['/Last']:
                return
            child = child['/Next']

    def addChild(self, child, pdf):
        """Append ``child`` to the end of the list.

        ``pdf.getReference`` is used to obtain the references stored in the
        link entries; each result must be an IndirectObject.
        """
        childObj = child.getObject()
        child = pdf.getReference(childObj)
        assert isinstance(child, IndirectObject)

        if '/First' not in self:
            self[NameObject('/First')] = child
            self[NameObject('/Count')] = NumberObject(0)
            prev = None
        else:
            prev = self['/Last']

        self[NameObject('/Last')] = child
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)

        if prev:
            prevRef = pdf.getReference(prev)
            assert isinstance(prevRef, IndirectObject)
            childObj[NameObject('/Prev')] = prevRef
            prev[NameObject('/Next')] = child

        parentRef = pdf.getReference(self)
        assert isinstance(parentRef, IndirectObject)
        childObj[NameObject('/Parent')] = parentRef

    def _decrementCount(self):
        # Integer arithmetic on a NumberObject yields a plain int, which
        # __setitem__ rejects; wrap the result again before storing it.
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)

    def removeChild(self, child):
        """Unlink ``child`` from the list and strip its link entries.

        :raises ValueError: if ``child`` has no ``/Parent``, if its
            ``/Parent`` is not equal to this tree, or if it is not found
            while walking the list.
        """
        childObj = child.getObject()

        if NameObject('/Parent') not in childObj:
            raise ValueError("Removed child does not appear to be a tree item")
        elif childObj[NameObject('/Parent')] != self:
            raise ValueError("Removed child is not a member of this tree")

        found = False
        prevRef = None
        prev = None
        curRef = self[NameObject('/First')]
        cur = curRef.getObject()
        lastRef = self[NameObject('/Last')]
        last = lastRef.getObject()
        while cur is not None:
            if cur == childObj:
                if prev is None:
                    if NameObject('/Next') in cur:
                        # Removing first tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        del nextObj[NameObject('/Prev')]
                        self[NameObject('/First')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing only tree node
                        assert self[NameObject('/Count')] == 1
                        del self[NameObject('/Count')]
                        del self[NameObject('/First')]
                        if NameObject('/Last') in self:
                            del self[NameObject('/Last')]
                else:
                    if NameObject('/Next') in cur:
                        # Removing middle tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        nextObj[NameObject('/Prev')] = prevRef
                        prev[NameObject('/Next')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing last tree node
                        assert cur == last
                        del prev[NameObject('/Next')]
                        self[NameObject('/Last')] = prevRef
                        self._decrementCount()
                found = True
                break

            prevRef = curRef
            prev = cur
            if NameObject('/Next') in cur:
                curRef = cur[NameObject('/Next')]
                cur = curRef.getObject()
            else:
                curRef = None
                cur = None

        if not found:
            raise ValueError("Removal couldn't find item in tree")

        del childObj[NameObject('/Parent')]
        if NameObject('/Next') in childObj:
            del childObj[NameObject('/Next')]
        if NameObject('/Prev') in childObj:
            del childObj[NameObject('/Prev')]

    def emptyTree(self):
        """Detach every child and remove ``/Count``, ``/First``, ``/Last``."""
        # Walk the list completely before touching it: children() follows
        # the very /Next entries that are deleted below.
        for child in list(self.children()):
            childObj = child.getObject()
            del childObj[NameObject('/Parent')]
            if NameObject('/Next') in childObj:
                del childObj[NameObject('/Next')]
            if NameObject('/Prev') in childObj:
                del childObj[NameObject('/Prev')]

        if NameObject('/Count') in self:
            del self[NameObject('/Count')]
        if NameObject('/First') in self:
            del self[NameObject('/First')]
        if NameObject('/Last') in self:
            del self[NameObject('/Last')]
class StreamObject(DictionaryObject):
    """A PDF stream: a dictionary plus the stream bytes in ``self._data``."""

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """Write the dictionary followed by the stream body.

        ``/Length`` is set from ``len(self._data)`` for the duration of the
        dictionary write and removed again afterwards.  The body is
        RC4-encrypted when ``encryption_key`` is truthy.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        DictionaryObject.writeToStream(self, stream, encryption_key)
        del self["/Length"]
        stream.write(b_("\nstream\n"))
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write(b_("\nendstream"))

    @staticmethod
    def initializeFromDictionary(data):
        """Build a stream object from a parsed dictionary.

        ``data`` must contain ``__streamdata__`` and ``/Length``; both are
        removed from it.  The result is an EncodedStreamObject when a
        ``/Filter`` entry is present, otherwise a DecodedStreamObject.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval

    def flateEncode(self):
        """Return a new EncodedStreamObject holding ``self._data`` passed
        through ``filters.FlateDecode.encode``.

        ``/FlateDecode`` is placed in front of any filters this stream
        already declares.  This stream itself is left unchanged.
        """
        if "/Filter" in self:
            existing = self["/Filter"]
            if isinstance(existing, ArrayObject):
                # Work on a copy: inserting into the existing array would
                # also change this stream's own /Filter entry.
                f = ArrayObject(existing)
                f.insert(0, NameObject("/FlateDecode"))
            else:
                f = ArrayObject()
                f.append(NameObject("/FlateDecode"))
                f.append(existing)
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
class DecodedStreamObject(StreamObject):
    """A stream object whose ``_data`` is handed out and replaced as-is."""

    def getData(self):
        """Return the stored stream data."""
        return self._data

    def setData(self, data):
        """Replace the stored stream data with ``data``."""
        self._data = data
class EncodedStreamObject(StreamObject):
    """A stream object whose data is decoded via ``filters`` on demand."""

    def __init__(self):
        self.decodedSelf = None

    def getData(self):
        """Return the decoded stream data.

        The decoded object is kept in ``self.decodedSelf``.  The cache test
        is a truthiness test on that object.
        """
        if not self.decodedSelf:
            # create decoded object
            plain = DecodedStreamObject()
            plain._data = filters.decodeStreamData(self)
            # Carry over every entry except those describing the encoding.
            for key, value in list(self.items()):
                if key not in ("/Length", "/Filter", "/DecodeParms"):
                    plain[key] = value
            self.decodedSelf = plain
            return plain._data
        # cached version of decoded object
        return self.decodedSelf.getData()

    def setData(self, data):
        """Not supported; always raises ``utils.PdfReadError``."""
        raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
class RectangleObject(ArrayObject):
    """
    This class is used to represent *page boxes* in PyPDF2. These boxes include:

        * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>`
        * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>`
        * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>`
        * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>`
        * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>`
    """

    def __init__(self, arr):
        # A rectangle is stored as [llx, lly, urx, ury].
        assert len(arr) == 4
        # Entries that are not number objects yet are converted.
        numbers = [self.ensureIsNumber(entry) for entry in arr]
        ArrayObject.__init__(self, numbers)

    def ensureIsNumber(self, value):
        """Return ``value`` unchanged if it is a NumberObject or
        FloatObject, otherwise wrapped in a FloatObject."""
        if isinstance(value, (NumberObject, FloatObject)):
            return value
        return FloatObject(value)

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    # -- single coordinates ------------------------------------------------
    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    # -- corners as (x, y) tuples -------------------------------------------
    def getLowerLeft(self):
        return self.getLowerLeft_x(), self.getLowerLeft_y()

    def getLowerRight(self):
        return self.getLowerRight_x(), self.getLowerRight_y()

    def getUpperLeft(self):
        return self.getUpperLeft_x(), self.getUpperLeft_y()

    def getUpperRight(self):
        return self.getUpperRight_x(), self.getUpperRight_y()

    # -- corner setters; ``value`` is an (x, y) pair ------------------------
    def setLowerLeft(self, value):
        self[0], self[1] = [self.ensureIsNumber(x) for x in value]

    def setLowerRight(self, value):
        self[2], self[1] = [self.ensureIsNumber(x) for x in value]

    def setUpperLeft(self, value):
        self[0], self[3] = [self.ensureIsNumber(x) for x in value]

    def setUpperRight(self, value):
        self[2], self[3] = [self.ensureIsNumber(x) for x in value]

    def getWidth(self):
        """Return upper-right x minus lower-left x."""
        return self.getUpperRight_x() - self.getLowerLeft_x()

    def getHeight(self):
        """Return upper-right y minus lower-left y."""
        return self.getUpperRight_y() - self.getLowerLeft_y()

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    """
    Property to read and modify the lower left coordinate of this box
    in (x,y) form.
    """

    lowerRight = property(getLowerRight, setLowerRight, None, None)
    """
    Property to read and modify the lower right coordinate of this box
    in (x,y) form.
    """

    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    """
    Property to read and modify the upper left coordinate of this box
    in (x,y) form.
    """

    upperRight = property(getUpperRight, setUpperRight, None, None)
    """
    Property to read and modify the upper right coordinate of this box
    in (x,y) form.
    """
class Field(TreeObject):
    """
    A class representing a field dictionary. This class is accessed through
    :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
    """

    def __init__(self, data):
        DictionaryObject.__init__(self)
        # Copy the known field entries that ``data`` provides; entries
        # missing from ``data`` are skipped.
        for entry in ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff",
                      "/V", "/DV", "/AA"):
            try:
                self[NameObject(entry)] = data[entry]
            except KeyError:
                continue

    fieldType = property(lambda self: self.get("/FT"))
    """
    Read-only property accessing the type of this field.
    """

    parent = property(lambda self: self.get("/Parent"))
    """
    Read-only property accessing the parent of this field.
    """

    kids = property(lambda self: self.get("/Kids"))
    """
    Read-only property accessing the kids of this field.
    """

    name = property(lambda self: self.get("/T"))
    """
    Read-only property accessing the name of this field.
    """

    altName = property(lambda self: self.get("/TU"))
    """
    Read-only property accessing the alternate name of this field.
    """

    mappingName = property(lambda self: self.get("/TM"))
    """
    Read-only property accessing the mapping name of this field. This
    name is used by PyPDF2 as a key in the dictionary returned by
    :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
    """

    flags = property(lambda self: self.get("/Ff"))
    """
    Read-only property accessing the field flags, specifying various
    characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
    """

    value = property(lambda self: self.get("/V"))
    """
    Read-only property accessing the value of this field. Format
    varies based on field type.
    """

    defaultValue = property(lambda self: self.get("/DV"))
    """
    Read-only property accessing the default value of this field.
    """

    additionalActions = property(lambda self: self.get("/AA"))
    """
    Read-only property accessing the additional actions dictionary.
    This dictionary defines the field's behavior in response to trigger events.
    See Section 8.5.2 of the PDF 1.7 reference.
    """
class Destination(TreeObject):
    """
    A class representing a destination within a PDF file.
    See section 8.2.1 of the PDF 1.6 reference.

    :param str title: Title of this destination.
    :param IndirectObject page: Reference to the page of this destination. Should
        be an instance of :class:`IndirectObject<PyPDF2.generic.IndirectObject>`.
    :param str typ: How the destination is displayed.
    :param args: Additional arguments may be necessary depending on the type.
    :raises PdfReadError: If destination type is invalid.
    :raises ValueError: If the number of ``args`` does not match what
        ``typ`` requires (raised by the tuple unpacking in the constructor).

    Valid ``typ`` arguments (see PDF spec for details):
             /Fit       No additional arguments
             /XYZ       [left] [top] [zoomFactor]
             /FitH      [top]
             /FitV      [left]
             /FitR      [left] [bottom] [right] [top]
             /FitB      No additional arguments
             /FitBH     [top]
             /FitBV     [left]
    """

    def __init__(self, title, page, typ, *args):
        DictionaryObject.__init__(self)
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ

        # from table 8.2 of the PDF 1.7 reference.
        if typ == "/XYZ":
            (self[NameObject("/Left")], self[NameObject("/Top")],
             self[NameObject("/Zoom")]) = args
        elif typ == "/FitR":
            (self[NameObject("/Left")], self[NameObject("/Bottom")],
             self[NameObject("/Right")], self[NameObject("/Top")]) = args
        elif typ in ("/FitH", "/FitBH"):
            self[NameObject("/Top")], = args
        elif typ in ("/FitV", "/FitBV"):
            self[NameObject("/Left")], = args
        elif typ in ("/Fit", "/FitB"):
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)

    def getDestArray(self):
        """
        Build the destination array: the raw ``/Page`` entry, the ``/Type``
        entry, then whichever of ``/Left``, ``/Bottom``, ``/Right``,
        ``/Top`` and ``/Zoom`` are set, in that order.
        """
        head = [self.raw_get('/Page'), self['/Type']]
        coordinates = [
            self[name]
            for name in ('/Left', '/Bottom', '/Right', '/Top', '/Zoom')
            if name in self
        ]
        return ArrayObject(head + coordinates)

    def writeToStream(self, stream, encryption_key):
        """
        Write this destination to ``stream`` as a dictionary holding a
        ``/D`` entry (the destination array) and ``/S /GoTo``.
        """
        stream.write(b_("<<\n"))
        key = NameObject('/D')
        key.writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        value = self.getDestArray()
        value.writeToStream(stream, encryption_key)

        key = NameObject("/S")
        key.writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        value = NameObject("/GoTo")
        value.writeToStream(stream, encryption_key)

        stream.write(b_("\n"))
        stream.write(b_(">>"))

    @property
    def title(self):
        """
        Read-only property accessing the destination title.

        :rtype: str
        """
        return self.get("/Title")

    @property
    def page(self):
        """
        Read-only property accessing the ``/Page`` entry of this
        destination, i.e. the ``page`` value given to the constructor
        (documented there as a reference to the page).
        """
        return self.get("/Page")

    @property
    def typ(self):
        """
        Read-only property accessing the destination type.

        :rtype: str
        """
        return self.get("/Type")

    @property
    def zoom(self):
        """
        Read-only property accessing the zoom factor.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Zoom", None)

    @property
    def left(self):
        """
        Read-only property accessing the left horizontal coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Left", None)

    @property
    def right(self):
        """
        Read-only property accessing the right horizontal coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Right", None)

    @property
    def top(self):
        """
        Read-only property accessing the top vertical coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Top", None)

    @property
    def bottom(self):
        """
        Read-only property accessing the bottom vertical coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Bottom", None)
class Bookmark(Destination):
    """
    A :class:`Destination` that serializes itself with its ``/Title`` and
    tree-link entries followed by a ``/Dest`` destination array.
    """

    def writeToStream(self, stream, encryption_key):
        """
        Write the entries among ``/Title``, ``/Parent``, ``/First``,
        ``/Last``, ``/Next`` and ``/Prev`` that are set (one per line, raw
        values), then ``/Dest`` with the destination array.
        """
        stream.write(b_("<<\n"))

        entry_names = ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev']
        present = [NameObject(name) for name in entry_names if name in self]
        for entry_key in present:
            entry_key.writeToStream(stream, encryption_key)
            stream.write(b_(" "))
            self.raw_get(entry_key).writeToStream(stream, encryption_key)
            stream.write(b_("\n"))

        NameObject('/Dest').writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        self.getDestArray().writeToStream(stream, encryption_key)
        stream.write(b_("\n"))

        stream.write(b_(">>"))
def encode_pdfdocencoding(unicode_string):
    """
    Encode ``unicode_string`` to bytes, one byte per character, using the
    ``_pdfDocEncoding_rev`` lookup table.

    :param unicode_string: text to encode.
    :return: the encoded byte string.
    :raises UnicodeEncodeError: if a character has no entry in
        ``_pdfDocEncoding_rev``.
    """
    # Collect the pieces and join once; repeated ``+=`` on an immutable
    # byte string re-copies the accumulated result on every character.
    encoded = []
    for c in unicode_string:
        try:
            encoded.append(b_(chr(_pdfDocEncoding_rev[c])))
        except KeyError:
            raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
                                     "does not exist in translation table")
    return b_('').join(encoded)
- def decode_pdfdocencoding(byte_array):
- retval = u_('')
- for b in byte_array:
- c = _pdfDocEncoding[ord_(b)]
- if c == u_('\u0000'):
- raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
- "does not exist in translation table")
- retval += c
- return retval
# Decoding table for "pdfdocencoding": entry ``i`` is the unicode character
# that byte value ``i`` decodes to.  Entries equal to U+0000 are placeholders
# for byte values without a translation (decode_pdfdocencoding() raises
# UnicodeDecodeError for them).  Trailing comments give the byte value of the
# first entry on each row.
_pdfDocEncoding = (u_('\u0000'),) * 24 + (  # 0x00-0x17: placeholders
    u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),  # 0x18
    u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),  # 0x20
    u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),  # 0x28
    u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),  # 0x30
    u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),  # 0x38
    u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),  # 0x40
    u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),  # 0x48
    u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),  # 0x50
    u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),  # 0x58
    u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),  # 0x60
    u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),  # 0x68
    u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),  # 0x70
    u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),  # 0x78
    u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),  # 0x80
    u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),  # 0x88
    u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),  # 0x90
    u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),  # 0x98
    u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),  # 0xa0
    u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),  # 0xa8
    u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),  # 0xb0
    u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),  # 0xb8
    u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),  # 0xc0
    u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),  # 0xc8
    u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),  # 0xd0
    u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),  # 0xd8
    u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),  # 0xe0
    u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),  # 0xe8
    u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),  # 0xf0
    u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff'),  # 0xf8
)

# One entry per possible byte value.
assert len(_pdfDocEncoding) == 256
- _pdfDocEncoding_rev = {}
- for i in range(256):
- char = _pdfDocEncoding[i]
- if char == u_("\u0000"):
- continue
- assert char not in _pdfDocEncoding_rev
- _pdfDocEncoding_rev[char] = i
|