123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- #!/usr/bin/env python
- """
- Representation and utils for ranges of PDF file pages.
- Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
- All rights reserved. This software is available under a BSD license;
- see https://github.com/mstamy2/PyPDF2/blob/master/LICENSE
- """
- import re
- from .utils import isString
# Regular expressions used to parse page-range strings.
# Each {int} placeholder expands to _INT_RE, which carries its own capture
# group; the digits in the "groups" comment sit under the position where
# each numbered group opens.
_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
# groups:         12     34     5 6     7 8
class ParseError(Exception):
    """
    Raised when a value cannot be interpreted as a page range.
    PageRange.__init__ raises it with the offending argument as its
    only exception argument.
    """
# User-facing help text for page-range expressions.  It is also spliced into
# the PageRange.__init__ docstring via str.format(), which is why the
# continuation lines are indented and why the text must not contain braces.
PAGE_RANGE_HELP = """Remember, page indices start with zero.
        Page range expression examples:
            :     all pages.                   -1    last page.
            22    just the 23rd page.          :-1   all but the last page.
            0:3   the first three pages.       -2    second-to-last page.
            :3    the first three pages.       -2:   last two pages.
            5:    from the sixth page onward.  -3:-1 third & second to last.
        The third, "stride" or "step" number is also recognized.
            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
            1:10:2    1 3 5 7 9                2::-1     2 1 0.
            ::-1      all pages in reverse order.
"""
class PageRange(object):
    """
    A slice-like representation of a range of page indices,
        i.e. page numbers, only starting at zero.
    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.
      o  PageRange(str) parses a string representing a page range.
      o  PageRange(slice) directly "imports" a slice.
      o  to_slice() gives the equivalent slice.
      o  str() and repr() allow printing.
      o  indices(n) is like slice.indices(n).
    """

    def __init__(self, arg):
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
            "int", "[int]:[int]" or "[int]:[int]:[int]",
            where the brackets indicate optional ints.
        {page_range_help}
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.
        Raises ParseError if arg is none of the above.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        # Non-strings never reach re.match; they fall through to ParseError.
        m = isString(arg) and re.match(PAGE_RANGE_RE, arg)
        if not m:
            raise ParseError(arg)

        if m.group(2):
            # Special case: just an int means a range of one page.
            # For -1 the stop must stay open: slice(-1, 0) would be empty.
            start = int(m.group(2))
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            # Groups 4, 6, 8 are start, stop, step; absent parts become None.
            self._slice = slice(*[int(g) if g else None
                                  for g in m.group(4, 6, 8)])

    # Just formatting this when there is __doc__ for __init__
    # (docstrings are stripped when Python runs with -OO).
    if __init__.__doc__:
        __init__.__doc__ = __init__.__doc__.format(
            page_range_help=PAGE_RANGE_HELP)

    @staticmethod
    def valid(input):
        """ True if input is a valid initializer for a PageRange. """
        # The parameter name shadows the builtin, but it is kept so that
        # existing keyword callers (valid(input=...)) continue to work.
        if isinstance(input, (slice, PageRange)):
            return True
        return bool(isString(input) and re.match(PAGE_RANGE_RE, input))

    def to_slice(self):
        """ Return the slice equivalent of this page range. """
        return self._slice

    def __str__(self):
        """ A string like "1:2:3". """
        s = self._slice
        if s.step is None:
            # A one-page range prints as a bare index.  start == -1 is
            # excluded: slice(-1, 0) selects nothing, while the string "-1"
            # parses back to the last page, so collapsing it would not
            # round-trip.
            if (s.start is not None and s.start != -1
                    and s.stop == s.start + 1):
                return str(s.start)
            indices = s.start, s.stop
        else:
            indices = s.start, s.stop, s.step
        return ':'.join("" if i is None else str(i) for i in indices)

    def __repr__(self):
        """ A string like "PageRange('1:2:3')". """
        return "PageRange(" + repr(str(self)) + ")"

    def indices(self, n):
        """
        n is the length of the list of pages to choose from.
        Returns arguments for range().  See help(slice.indices).
        """
        return self._slice.indices(n)
PAGE_RANGE_ALL = PageRange(":")  # The range of all pages.
def parse_filename_page_ranges(args):
    """
    Given a list of filenames and page ranges, return a list of
    (filename, page_range) pairs.
    First arg must be a filename; other args are filenames, page-range
    expressions, slice objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    args may be any iterable (list, tuple, ...).  Anything PageRange.valid()
    accepts is treated as a page range; everything else is taken to be a
    filename.  Raises ValueError if a page range comes before any filename.
    """
    pairs = []
    pdf_filename = None
    did_page_range = False
    # list() accepts tuples and other iterables without mutating the caller's
    # sequence.  The trailing None is a sentinel: it is not a valid page
    # range, so it goes through the "new filename" branch and flushes the
    # last file.
    for arg in list(args) + [None]:
        if PageRange.valid(arg):
            if not pdf_filename:
                raise ValueError("The first argument must be a filename, "
                                 "not a page range.")

            pairs.append((pdf_filename, PageRange(arg)))
            did_page_range = True
        else:
            # New filename or end of list--do all of the previous file?
            if pdf_filename and not did_page_range:
                pairs.append((pdf_filename, PAGE_RANGE_ALL))

            pdf_filename = arg
            did_page_range = False
    return pairs
|