'''
Extract images from PDF without resampling or altering.

Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python

Usage: python <this script> input_file [page_index]

Images found on the selected page are written to the current working
directory. Each file is named after its XObject key with the first character
of the key dropped, plus an extension chosen from the image's /Filter.
'''
import sys

import PyPDF2
from PIL import Image

# Page scanned when no page_index is given on the command line. The value is
# passed straight to PdfFileReader.getPage().
DEFAULT_PAGE_INDEX = 30

# Filters whose stream data is written to disk unchanged, with the file
# extension used for each. Kept as pairs compared with ``==`` (rather than a
# dict lookup) because a /Filter entry may be an unhashable array.
RAW_FILTER_EXTENSIONS = (
    ('/DCTDecode', ".jpg"),
    ('/JPXDecode', ".jp2"),
    ('/CCITTFaxDecode', ".tiff"),
)


def _save_image(image_obj, base_name):
    '''
    Save one image XObject as ``base_name`` plus a filter-specific extension.

    image_obj -- the image XObject (a PDF stream dictionary).
    base_name -- output file name without extension.

    Returns the file name written, or None when the image's filter is not
    one this script handles (nothing is written in that case).
    '''
    data = image_obj.getData()
    image_filter = image_obj['/Filter'] if '/Filter' in image_obj else None

    # Raw dumps: the stream data goes to disk byte for byte.
    for filter_name, extension in RAW_FILTER_EXTENSIONS:
        if image_filter == filter_name:
            file_name = base_name + extension
            with open(file_name, "wb") as out_file:
                out_file.write(data)
            return file_name

    if image_filter is not None and image_filter != '/FlateDecode':
        # Unhandled filter (or a filter array): skipped silently.
        return None

    # FlateDecode or unfiltered data: rebuild the bitmap with PIL.
    size = (image_obj['/Width'], image_obj['/Height'])
    mode = "RGB" if image_obj['/ColorSpace'] == '/DeviceRGB' else "P"
    img = Image.frombytes(mode, size, data)
    # Only FlateDecode images get their soft mask applied as an alpha channel.
    if image_filter == '/FlateDecode' and "/SMask" in image_obj:
        alpha = Image.frombytes("L", size, image_obj["/SMask"].getData())
        img.putalpha(alpha)
    file_name = base_name + ".png"
    img.save(file_name)
    return file_name


def extract_images(pdf_path, page_index=DEFAULT_PAGE_INDEX):
    '''
    Write every image XObject on one page of a PDF to the current directory.

    pdf_path   -- path of the PDF file to read.
    page_index -- index of the page to scan, as accepted by getPage().

    Returns the list of file names written. Prints "No image found." when
    the page's resources have no /XObject entry.
    Raises IndexError when page_index is outside the document.
    '''
    written = []
    # The reader is used throughout this block, so the file stays open until
    # extraction is finished and is closed even if an error occurs.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page_count = reader.getNumPages()
        if not 0 <= page_index < page_count:
            raise IndexError(
                "page index {} out of range: document has {} page(s)".format(
                    page_index, page_count))

        resources = reader.getPage(page_index)['/Resources']
        if '/XObject' not in resources:
            print("No image found.")
            return written

        x_objects = resources['/XObject'].getObject()
        for key in x_objects:
            image_obj = x_objects[key]
            if image_obj['/Subtype'] != '/Image':
                continue
            # key[1:] drops the first character of the XObject key.
            file_name = _save_image(image_obj, key[1:])
            if file_name is not None:
                written.append(file_name)
    return written


def main(argv):
    '''
    Parse the command line, run the extraction and return an exit status.

    argv -- full argument vector, program name first.

    Returns 0 on success and 1 when the arguments are invalid.
    '''
    usage = "\nUsage: python {} input_file [page_index]\n".format(argv[0])
    if len(argv) not in (2, 3):
        print(usage)
        return 1

    page_index = DEFAULT_PAGE_INDEX
    if len(argv) == 3:
        try:
            page_index = int(argv[2])
        except ValueError:
            print(usage)
            return 1

    extract_images(argv[1], page_index)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))