"""
A custom EXIF parsing module, aimed at high performance.
"""
import struct, mmap, os
from utils import lazy_load, lazy_load_iter
def read_struct(file, fmt):
    """
    Read as many bytes from `file` as the struct format `fmt` occupies and unpack them.

    Returns the tuple produced by struct.unpack. If the file yields fewer bytes than the format needs,
    struct.unpack raises struct.error.
    """
    raw = file.read(struct.calcsize(fmt))

    return struct.unpack(fmt, raw)
class Buffer(object):
    """
    Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.

    Carries the offset at which the view starts inside the wrapped object, and a struct-module byte-order
    prefix that is applied to every structured read.
    """

    def __init__(self, obj, offset=None, size=None, struct_prefix='='):
        """
        Create a new read-only view on `obj`.

            obj             - any object accepted by the builtin buffer()
            offset          - start of the view within obj; None means the very beginning
            size            - length of the view in bytes; None means everything from offset onwards
            struct_prefix   - struct-module byte-order prefix, which should be one of '<' or '>'

        Standard size/alignment are assumed.
        """
        # buffer() takes (object, offset, size) positionally, so a size may only be passed together with an
        # explicit offset; a missing offset is normalized to zero first
        start = 0 if offset is None else offset

        if size is None:
            self.buf = buffer(obj, start)
        else:
            self.buf = buffer(obj, start, size)

        # the offset is stored as a number (never None) so that it can be used in offset arithmetic and
        # numeric formatting by the users of this object
        self.offset = start
        self.size = size
        self.prefix = struct_prefix

    def subregion(self, offset, length=None):
        """
        Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
        length, if any, and the same struct_prefix.
        """
        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)

    def pread(self, offset, length):
        """
        Read `length` bytes of raw data starting at `offset` within this buffer.
        """
        return self.buf[offset:offset + length]

    def pread_struct(self, offset, fmt):
        """
        Unpack the struct format `fmt` (without byte-order prefix) at `offset`, returning a tuple of values.
        """
        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)

    def pread_item(self, offset, fmt):
        """
        Unpack a single-item struct format at `offset` and return the bare value.

        Raises ValueError if the format does not describe exactly one item.
        """
        value, = self.pread_struct(offset, fmt)

        return value

    def iter_offsets(self, count, size, offset=0):
        """
        Return the series of offsets for `count` items of `size` bytes, beginning at `offset`.
        """
        return xrange(offset, offset + count * size, size)

    def item_size(self, fmt):
        """
        Return the size in bytes of the given item format, using this buffer's byte-order prefix.
        """
        return struct.calcsize(self.prefix + fmt)

    def unpack_item(self, fmt, data):
        """
        Unpack a single item from the given data, using this buffer's byte-order prefix.

        `data` must be exactly as long as the format, and the format must describe exactly one item.
        """
        value, = struct.unpack(self.prefix + fmt, data)

        return value
def mmap_buffer(file, size):
    """
    Map the first `size` bytes of the given open file into memory for read-only access.

    The mapping is created from the file's descriptor; `size` is handed to mmap.mmap unchanged.
    """
    descriptor = file.fileno()

    return mmap.mmap(descriptor, size, access=mmap.ACCESS_READ)
import exif_data
class Tag(object):
    """
    Represents a single Tag in an IFD
    """

    def __init__(self, ifd, offset, tag, type, count, data_raw):
        """
        Build a Tag with the given binary items from the IFD entry.

            ifd         - the IFD this entry was read from
            offset      - offset of the IFD entry, as computed by the IFD
            tag         - numeric tag code
            type        - numeric field type code
            count       - number of values
            data_raw    - the raw four-byte value/offset field of the entry
        """
        self.ifd = ifd
        self.offset = offset
        self.tag = tag
        self.type = type
        self.count = count
        self.data_raw = data_raw

        # lookup the type for this tag
        self.type_data = exif_data.FIELD_TYPES.get(type)

        if self.type_data:
            self.type_format, self.type_name, self.type_func = self.type_data
        else:
            # unknown field type: keep the attributes defined so that readers get None, not AttributeError
            self.type_format = self.type_name = self.type_func = None

        # lookup the tag data for this tag
        self.tag_data = self.ifd.tag_dict.get(tag)

    @property
    def name(self):
        """
        Lookup the name of this tag via its code, returns None if unknown.
        """
        if self.tag_data:
            return self.tag_data.name

        return None

    def is_subifd(self):
        """
        Tests if this Tag is of a IFDTag type; always returns a real boolean.
        """
        return bool(self.tag_data and isinstance(self.tag_data, exif_data.IFDTag))

    @lazy_load
    def subifd(self):
        """
        Load the sub-IFD for this tag
        """
        # the tag's own tag dict takes precedence, falling back to the one of the containing IFD
        tag_dict = self.tag_data.ifd_tags or self.ifd.tag_dict

        return self.ifd.exif._load_subifd(self, tag_dict)

    def process_values(self, raw_values):
        """
        Process the given raw values unpacked from the file.

        Applies the field type's filter function when there is one, and returns the raw values unchanged
        otherwise.
        """
        if self.type_data and self.type_func:
            return self.type_func(raw_values)

        return raw_values

    def readable_value(self, values):
        """
        Convert the given values for this tag into a human-readable string.

        Known tags are mapped using their tag data; for unknown tags the values are joined with commas.
        """
        if self.tag_data:
            return self.tag_data.map_values(values)

        return ", ".join(str(value) for value in values)
# size of an IFD entry in bytes: tag (2) + type (2) + count (4) + raw value/offset field (4)
IFD_ENTRY_SIZE = 2 + 2 + 4 + 4
class IFD(Buffer):
    """
    An IFD (Image file directory) region inside some EXIF data: an entry count, the entries themselves,
    and the offset of the following IFD.
    """

    def __init__(self, exif, buffer, tag_dict, **buffer_opts):
        """
        Set up the view on the IFD data in the given bufferable object, using the given buffer opts.

        The entry `count` and the `next_offset` value are read right away.
        """
        super(IFD, self).__init__(buffer, **buffer_opts)

        self.exif = exif
        self.tag_dict = tag_dict

        # two-byte entry count first
        self.count = self.pread_item(0, 'H')

        # the next-IFD offset sits directly behind the last entry
        entries_end = 0x02 + self.count * IFD_ENTRY_SIZE
        self.next_offset = self.pread_item(entries_end, 'I')

    @lazy_load_iter
    def tags(self):
        """
        Yield a Tag object for each entry in this IFD
        """
        for entry_offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02):
            fields = self.pread_struct(entry_offset, 'HHI4s')
            tag_code, type_code, value_count, raw = fields

            # the Tag is given the entry position with this IFD's own offset added
            yield Tag(self, self.offset + entry_offset, tag_code, type_code, value_count, raw)

    def get_tags(self, filter=None):
        """
        Yield the tags of this IFD, descending into sub-IFDs instead of yielding the tags that refer to them.
        """
        for tag in self.tags:
            if not tag.is_subifd():
                # plain tag
                yield tag
                continue

            # replace the sub-IFD tag with the tags it leads to
            for nested in tag.subifd.get_tags(filter=filter):
                yield nested
class EXIF(Buffer):
    """
    Represents the EXIF data embedded in some image file in the form of a Region.
    """

    def __init__(self, buffer, **buffer_opts):
        """
        Access the EXIF data from the given bufferable object with the given buffer options.
        """
        super(EXIF, self).__init__(buffer, **buffer_opts)

        # keep a reference to the original bufferable object
        self.buffer = buffer

    @lazy_load_iter
    def ifds(self):
        """
        Iterate over the primary IFDs in this EXIF.

        The chain starts at the offset stored at 0x04 and follows each IFD's next_offset until it is zero.
        """
        offset = self.pread_item(0x04, 'I')

        # offsets already visited; a chain that points back at itself would otherwise never terminate
        seen = set()

        while offset and offset not in seen:
            seen.add(offset)

            # the IFD must decode its fields with the same byte order as the rest of the EXIF data
            ifd = IFD(self, self.buf, exif_data.EXIF_TAGS, offset=offset, struct_prefix=self.prefix)

            yield ifd

            offset = ifd.next_offset

    def _load_subifd(self, tag, tag_dict):
        """
        Creates and returns a sub-IFD for the given tag, whose single value is the sub-IFD's offset.
        """
        offset, = self.tag_values_raw(tag)

        # same byte order as this EXIF, see ifds
        return IFD(self, self.buf, tag_dict, offset=offset, struct_prefix=self.prefix)

    def tag_data_info(self, tag):
        """
        Calculate the location, format and size of the given tag's data.

        Returns a (fmt, offset, size) tuple, or None if the tag's field type is unknown.
        """
        if not tag.type_data:
            return None

        if len(tag.type_format) == 1:
            # single-character format: let struct handle the count
            fmt = "%d%s" % (tag.count, tag.type_format)
        else:
            # multi-character format: repeat it once per value
            fmt = tag.type_format * tag.count

        size = self.item_size(fmt)

        if size > 0x04:
            # too large for the entry: the raw field holds the offset of the data
            offset = self.unpack_item('I', tag.data_raw)
        else:
            # the data is stored inline, in the last four bytes of the entry
            offset = tag.offset + 0x08

        return fmt, offset, size

    def tag_values_raw(self, tag):
        """
        Get the raw values for the given tag as a tuple.

        Returns None if the tag could not be recognized.
        """
        data_info = self.tag_data_info(tag)

        if not data_info:
            return None

        data_fmt, data_offset, data_size = data_info

        return self.pread_struct(data_offset, data_fmt)

    def tag_values(self, tag):
        """
        Gets the values for the given tag, run through the tag's own process_values.
        """
        return tag.process_values(self.tag_values_raw(tag))

    def tag_value(self, tag):
        """
        Return the human-readable string value for the given tag, or "" if it has no values.
        """
        values = self.tag_values(tag)

        if not values:
            return ""

        return tag.readable_value(values)

    def get_main_tags(self, **opts):
        """
        Get the tags for the main image's IFD as a dict of tag name -> readable value.

        The options are passed on to the IFD's get_tags. Raises Exception if there is no IFD at all.
        """
        if not self.ifds:
            # weird case
            raise Exception("No IFD for main image found")

        # the main IFD is always the first one
        main_ifd = self.ifds[0]

        return dict((tag.name, self.tag_value(tag)) for tag in main_ifd.get_tags(**opts))
# struct byte-order prefix for each of the two-byte TIFF byte order markers
TIFF_BYTE_ORDER = dict(
    II='<',
    MM='>',
)

# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
TIFF_BYTEORDER_MAGIC = 0x2A
def tiff_load(file, length=0, **opts):
    """
    Load the Exif/TIFF data from the given file at its current position.

        file    - open file, positioned at the first byte of the TIFF header
        length  - length of the Exif/TIFF data in bytes; zero means everything up to the end of the file
        opts    - passed on to the EXIF constructor; an explicit struct_prefix overrides the detected one

    Returns the EXIF object. Raises KeyError if the byte-order marker is not recognized, and Exception if
    the TIFF magic number does not match.
    """
    # all Exif data offsets are relative to the beginning of this TIFF header
    offset = file.tell()

    if length:
        # the mapping starts at the beginning of the file, so it has to extend to the end of the data,
        # not just `length` bytes in; it is clamped so that a truncated file still maps
        file_size = os.fstat(file.fileno()).st_size
        buffer = mmap_buffer(file, min(offset + length, file_size))
        size = length
    else:
        # no length given: map the whole file, and use everything behind the header position
        buffer = mmap_buffer(file, 0)
        size = len(buffer) - offset

    # read byte-order header
    byte_order = file.read(2)

    # map to struct prefix
    struct_prefix = TIFF_BYTE_ORDER[byte_order]

    # validate
    check_value, = read_struct(file, struct_prefix + 'H')

    if check_value != TIFF_BYTEORDER_MAGIC:
        # %r, since byte_order is a two-character string and cannot be formatted with %c
        raise Exception("Invalid byte-order for TIFF: %r -> %d" % (byte_order, check_value))

    # the data must be decoded with the byte order that was just detected
    opts.setdefault('struct_prefix', struct_prefix)

    # build and return the EXIF object with the correct offset/size from the mmap region
    return EXIF(buffer, offset=offset, size=size, **opts)
# the JPEG markers that don't have any data
JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)

# the first marker in a JPEG File
JPEG_START_MARKER = 0xD8

# the JPEG APP1 marker used for EXIF
JPEG_EXIF_MARKER = 0xE1

# the JPEG APP1 Exif header
JPEG_EXIF_HEADER = "Exif\x00\x00"

# markers that end the walk: EOI (0xD9) closes the image, and SOS (0xDA) is followed by entropy-coded
# data, which is not made up of sized marker segments and can not be skipped over by its size field
_JPEG_FINAL_MARKERS = (0xD9, 0xDA)


def jpeg_markers(file):
    """
    Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.

    The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
    region, and may be seek'd around if needed.

    Iteration ends after the EOI or SOS marker has been yielded, or when the file ends between two markers.
    Raises Exception for something that is not a marker or has an invalid size, and struct.error if the
    file ends in the middle of a marker.

    XXX: find a real implementation of this somewhere?
    """
    while True:
        # read directly instead of via read_struct, so that a clean end of file can be told apart from a
        # truncated marker
        marker_data = file.read(2)

        if not marker_data:
            return

        marker_byte, marker_type = struct.unpack('!BB', marker_data)

        if marker_byte != 0xff:
            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))

        if marker_type in JPEG_NOSIZE_MARKERS:
            # special cases for no data
            size = 0
        else:
            # read size field
            size, = struct.unpack('!H', file.read(2))

            if size < 0x02:
                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))

            # do not count the size field itself
            size = size - 2

        # remember where the data starts, as the consumer may move the file position around
        offset = file.tell()

        yield marker_type, size

        if marker_type in _JPEG_FINAL_MARKERS:
            # nothing that can be parsed as a marker segment follows
            return

        # absolute seek to next marker
        file.seek(offset + size)
def jpeg_find_exif(file):
    """
    Locate the Exif/TIFF section of the given JPEG file.

    On success the file is left positioned at the start of the Exif/TIFF header, and the size of the
    Exif/TIFF data (without the Exif signature) is returned. Returns None once the markers run out
    without a match.

    Raises Exception if the first marker is not the JPEG start marker.
    """
    signature_length = len(JPEG_EXIF_HEADER)

    for index, (marker, size) in enumerate(jpeg_markers(file)):
        if index == 0:
            # the very first marker only serves to verify that this is a JPEG file
            if marker != JPEG_START_MARKER:
                raise Exception("JPEG file must start with 0xFF%02x marker" % (marker, ))

            continue

        if marker != JPEG_EXIF_MARKER:
            continue

        # an APP1 segment only counts if it opens with the Exif signature
        if file.read(signature_length) != JPEG_EXIF_HEADER:
            continue

        # the signature has been consumed, what remains is the Exif/TIFF data
        return size - signature_length

    # nothing
    return None
def jpeg_load(file, **opts):
    """
    Load the Exif/TIFF data embedded in the given JPEG file, handing the options on to tiff_load.

    Returns None if jpeg_find_exif reports no size (None or zero).
    """
    exif_size = jpeg_find_exif(file)

    if exif_size:
        # the file is now positioned at the TIFF header
        return tiff_load(file, exif_size, **opts)

    return None
def load_path(path, **opts):
    """
    Loads an EXIF object from the given filesystem path.

    The loader is chosen by file extension (.jpeg/.jpg/.tiff, case-insensitive); the options are passed
    on to it. Returns None if the extension is not recognized, or if the loader found nothing.
    Errors raised by open() or by the loader are not caught.
    """
    # file extension
    fext = os.path.splitext(path)[1]

    # map
    func = {
        '.jpeg': jpeg_load,
        '.jpg': jpeg_load,
        '.tiff': tiff_load,  # XXX: untested
    }.get(fext.lower())

    if not func:
        # XXX: sniff the file
        return None

    file = open(path, 'rb')

    try:
        return func(file, **opts)

    finally:
        # the loaded data is read through an mmap, which holds its own duplicate of the file
        # descriptor, so the file object does not need to stay open
        file.close()
def dump_tag(exif, i, tag, indent=2):
    """
    Dump the given tag to stdout: one header line, followed by either the sub-IFD it refers to, or its
    values and their readable form.

        exif    - the EXIF object that the tag belongs to
        i       - index of the tag, for display
        tag     - the Tag to dump
        indent  - number of tabs to indent the output with

    Tags with an unknown field type only get the header line, as their data can not be decoded.
    """
    pad = '\t' * indent

    data_info = exif.tag_data_info(tag)

    if data_info:
        data_fmt, data_offset, data_size = data_info
        data_offset_str = "%#04x" % data_offset

    else:
        # unknown field type: there is no data offset to format as a number
        data_fmt = data_size = None
        data_offset_str = "None"

    print("%sTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%s, size=%s, is_subifd=%s:" % (
        pad,
        i,
        tag.offset, tag.offset + exif.offset,
        tag.tag, tag.name or '???',
        tag.type, tag.type_name if tag.type_data else '???',
        tag.count,
        data_fmt, data_offset_str, data_size,
        tag.is_subifd(),
    ))

    if tag.is_subifd():
        # recurse
        dump_ifd(exif, 0, tag.subifd, indent + 1)
        return

    values = exif.tag_values(tag)

    if values is None:
        # unknown field type: no values could be read
        return

    # dump each value
    for index, value in enumerate(values):
        print("%s\t%02d: %.120r" % (pad, index, value))

    # and then the readable one
    print("%s\t-> %.120s" % (pad, tag.readable_value(values), ))
def dump_ifd(exif, i, ifd, indent=1):
    """
    Print a header line for the given IFD to stdout, then dump each of its tags one level deeper.
    """
    header = "%sIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
        '\t' * indent,
        i,
        ifd.offset, ifd.offset + exif.offset,
        ifd.count,
        ifd.next_offset
    )
    print(header)

    for tag_index, tag in enumerate(ifd.tags):
        dump_tag(exif, tag_index, tag, indent + 1)
def dump_exif(exif):
    """
    Print the offset and size of the given EXIF object to stdout, followed by a dump of each of its IFDs.
    """
    summary = "EXIF offset=%#08x, size=%d:" % (exif.offset, exif.size)
    print(summary)

    for ifd_index, ifd in enumerate(exif.ifds):
        dump_ifd(exif, ifd_index, ifd)
def list_tags(exif):
    """
    Print one right-aligned "name: value" line to stdout for each of the main image's tags.
    """
    main_tags = exif.get_main_tags()

    for name, text in main_tags.iteritems():
        line = "%30s: %s" % (name, text)
        print(line)
def main_path(path, dump):
    """
    Print the given path, load its EXIF data, and either dump it in full (`dump` set) or list the
    main tags.

    Raises Exception if load_path returns nothing for the path.
    """
    print("%s: " % path)

    exif = load_path(path)

    if not exif:
        raise Exception("No EXIF data found")

    # full dump or neat listing
    output = dump_exif if dump else list_tags
    output(exif)
def main(paths, dump=False):
    """
    Run main_path over every one of the given paths, in order, with the same `dump` setting.
    """
    for each_path in paths:
        main_path(each_path, dump=dump)
if __name__ == '__main__':
    import getopt
    import sys

    # -d / --dump selects the full dump; everything else on the command line is a path
    options, paths = getopt.getopt(sys.argv[1:], "d", ["dump"])
    dump = any(flag in ('-d', "--dump") for flag, _ in options)

    main(paths, dump=dump)