# pdf-image-extractor.py
'''
Extract images from a PDF page without resampling or altering them.
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
import os
import sys

import PyPDF2
from PIL import Image
  9. if (len(sys.argv) != 2):
  10. print("\nUsage: python {} input_file\n".format(sys.argv[0]))
  11. sys.exit(1)
  12. pdf = sys.argv[1]
  13. if __name__ == '__main__':
  14. input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
  15. page0 = input1.getPage(30)
  16. if '/XObject' in page0['/Resources']:
  17. xObject = page0['/Resources']['/XObject'].getObject()
  18. for obj in xObject:
  19. if xObject[obj]['/Subtype'] == '/Image':
  20. size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
  21. data = xObject[obj].getData()
  22. if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
  23. mode = "RGB"
  24. else:
  25. mode = "P"
  26. if '/Filter' in xObject[obj]:
  27. if xObject[obj]['/Filter'] == '/FlateDecode':
  28. img = Image.frombytes(mode, size, data)
  29. if "/SMask" in xObject[obj]: # add alpha channel
  30. alpha = Image.frombytes("L", size, xObject[obj]["/SMask"].getData())
  31. img.putalpha(alpha)
  32. img.save(obj[1:] + ".png")
  33. elif xObject[obj]['/Filter'] == '/DCTDecode':
  34. img = open(obj[1:] + ".jpg", "wb")
  35. img.write(data)
  36. img.close()
  37. elif xObject[obj]['/Filter'] == '/JPXDecode':
  38. img = open(obj[1:] + ".jp2", "wb")
  39. img.write(data)
  40. img.close()
  41. elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
  42. img = open(obj[1:] + ".tiff", "wb")
  43. img.write(data)
  44. img.close()
  45. else:
  46. img = Image.frombytes(mode, size, data)
  47. img.save(obj[1:] + ".png")
  48. else:
  49. print("No image found.")