Add CCITTFax Decode and JPEG test
This commit is contained in:
parent
efae6bcae6
commit
1273824c0f
|
@ -331,10 +331,51 @@ class ASCII85Decode(object):
|
|||
return bytes(out)
|
||||
decode = staticmethod(decode)
|
||||
|
||||
|
||||
class DCTDecode(object):
    """Pass-through filter for /DCTDecode (JPEG) streams."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` unchanged.

        :param data: raw stream bytes.
        :param decodeParms: accepted for signature parity with the other
            filters; not used.
        :return: the very same ``data`` object that was passed in.
        """
        return data
|
||||
|
||||
class JPXDecode(object):
    """Pass-through filter for /JPXDecode (JPEG 2000) streams."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` unchanged.

        :param data: raw stream bytes.
        :param decodeParms: accepted for signature parity with the other
            filters; not used.
        :return: the very same ``data`` object that was passed in.
        """
        return data
|
||||
|
||||
class CCITTFaxDecode(object):
    """Wrap CCITT fax-encoded image data in a minimal TIFF container.

    The fax data is not decompressed here; a single-strip, little-endian
    TIFF header describing the image is prepended so that the result can be
    written to a ``.tiff`` file.
    """

    def decode(data, decodeParms=None, height=0):
        """Return ``data`` prefixed with a TIFF header.

        :param data: raw CCITT-encoded stream bytes.
        :param decodeParms: the stream's /DecodeParms dictionary, or ``None``.
            ``/K`` selects the encoding (negative = Group 4, otherwise
            Group 3) and ``/Columns`` gives the image width. Missing entries
            fall back to the PDF defaults (K = 0, Columns = 1728).
        :param height: image height in rows. When falsy (``0``, ``None``,
            ``()``), ``/Rows`` from ``decodeParms`` is used, defaulting to 0.
        :return: TIFF header bytes followed by ``data``.
        """
        # Tolerate a missing /DecodeParms entry instead of failing on
        # subscripting None / an unbound compression variable.
        parms = decodeParms or {}

        # PDF spec: K < 0 is pure two-dimensional (Group 4) encoding;
        # K >= 0 is Group 3 (1-D or mixed).
        if parms.get("/K", 0) < 0:
            ccitt_group = 4
        else:
            ccitt_group = 3

        width = parms.get("/Columns", 1728)
        if not height:
            height = parms.get("/Rows", 0)
        img_size = len(data)

        # Header (8 bytes) + entry count (2) + 8 IFD entries (12 each)
        # + 4-byte offset of the next IFD, as laid out by TIFF 6.0.
        # SHORT/LONG are unsigned in TIFF, hence 'H'/'L'.
        tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
        header_size = struct.calcsize(tiff_header_struct)
        tiff_header = struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, height
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, ccitt_group,  # Compression, SHORT, 1, 3 = Group 3 / 4 = Group 4
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, header_size,  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, height
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
            0,  # Offset of next IFD: 0 = this is the last IFD
        )

        return tiff_header + data

    decode = staticmethod(decode)
|
||||
|
||||
def decodeStreamData(stream):
|
||||
from .generic import NameObject
|
||||
filters = stream.get("/Filter", ())
|
||||
|
||||
if len(filters) and not isinstance(filters[0], NameObject):
|
||||
# we have a single filter instance
|
||||
filters = (filters,)
|
||||
|
@ -350,9 +391,13 @@ def decodeStreamData(stream):
|
|||
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
|
||||
elif filterType == "/ASCII85Decode" or filterType == "/A85":
|
||||
data = ASCII85Decode.decode(data)
|
||||
elif filterType == "/DCTDecode" or filterType == "/JPXDecode":
|
||||
#return raw data for jpg or jpeg2000 image
|
||||
pass
|
||||
elif filterType == "/DCTDecode":
|
||||
data = DCTDecode.decode(data)
|
||||
elif filterType == "/JPXDecode":
|
||||
data = JPXDecode.decode(data)
|
||||
elif filterType == "/CCITTFaxDecode":
|
||||
height = stream.get("/Height", ())
|
||||
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
|
||||
elif filterType == "/Crypt":
|
||||
decodeParams = stream.get("/DecodeParams", {})
|
||||
if "/Name" not in decodeParams and "/Type" not in decodeParams:
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -17,14 +17,12 @@ pdf = sys.argv[1]
|
|||
|
||||
if __name__ == '__main__':
|
||||
input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
|
||||
page0 = input1.getPage(2)
|
||||
page0 = input1.getPage(30)
|
||||
|
||||
if '/XObject' in page0['/Resources']:
|
||||
xObject = page0['/Resources']['/XObject'].getObject()
|
||||
|
||||
print(xObject)
|
||||
for obj in xObject:
|
||||
print(xObject[obj])
|
||||
if xObject[obj]['/Subtype'] == '/Image':
|
||||
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
|
||||
data = xObject[obj].getData()
|
||||
|
@ -45,6 +43,10 @@ if __name__ == '__main__':
|
|||
img = open(obj[1:] + ".jp2", "wb")
|
||||
img.write(data)
|
||||
img.close()
|
||||
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
|
||||
img = open(obj[1:] + ".tiff", "wb")
|
||||
img.write(data)
|
||||
img.close()
|
||||
else:
|
||||
img = Image.frombytes(mode, size, data)
|
||||
img.save(obj[1:] + ".png")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import sys
|
||||
import unittest
|
||||
import binascii
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
|
@ -37,6 +38,28 @@ class PdfReaderTestCases(unittest.TestCase):
|
|||
msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
|
||||
% (pdftext, ipdf_p1_text))
|
||||
|
||||
def test_PdfReaderJpegImage(self):
    '''
    Test loading and parsing of a file. Extract the image of the file and
    compare its hex encoding to the expected textual output.
    Expected outcome: file loads, image matches expected.
    '''

    with open(os.path.join(RESOURCE_ROOT, 'jpeg.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        ipdf = PdfFileReader(inputfile)

        # Retrieve the expected hex text of the image
        with open(os.path.join(RESOURCE_ROOT, 'jpeg.txt'), 'r') as pdftext_file:
            imagetext = pdftext_file.read()

        ipdf_p0 = ipdf.getPage(0)
        xObject = ipdf_p0['/Resources']['/XObject'].getObject()
        data = xObject['/Im4'].getData()

        # hexlify() returns bytes while the reference file is read as text;
        # decode so that both sides of the comparison have the same type.
        extracted = binascii.hexlify(data).decode('ascii')

        # Compare the hex dump of the image to a known source
        self.assertEqual(extracted, imagetext,
            msg='PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (imagetext, extracted))
|
||||
|
||||
class AddJsTestCase(unittest.TestCase):
|
||||
|
||||
|
|
Loading…
Reference in New Issue