"""
A custom EXIF parsing module, aimed at high performance.
"""

import struct, mmap, os

from utils import lazy_load, lazy_load_iter


def read_struct(file, fmt):
    """
    Read and unpack structured data from the current position of a file.

    Reads `struct.calcsize(fmt)` bytes from `file` and returns the unpacked values as a tuple.

    Raises struct.error if the file yields fewer bytes than the format requires.
    """

    data = file.read(struct.calcsize(fmt))

    return struct.unpack(fmt, data)


class Buffer(object):
    """
    Wraps a buffer object (anything that supports the Python 2 buffer protocol) for read-only access.

    Includes an offset for relative values, and an endianness prefix used for all structured reads.
    """

    def __init__(self, obj, offset=None, size=None, struct_prefix='='):
        """
        Create a new Buffer viewing `obj` from `offset` (default: the start) for `size` bytes (default: to the end).

        The endianness is given in the form of a struct-module prefix, which should be one of '<' or '>'.
        Standard size/alignment are assumed.
        """

        # buffer() takes (object, offset, size) positionally: the offset must always be supplied when a size is,
        # otherwise a size given without an offset would be interpreted as the offset.
        start = 0 if offset is None else offset

        if size is None:
            self.buf = buffer(obj, start)
        else:
            self.buf = buffer(obj, start, size)

        # store the normalized offset, so that callers can always do arithmetic on it
        self.offset = start
        self.size = size
        self.prefix = struct_prefix

    def subregion(self, offset, length=None):
        """
        Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
        length, if any, and the same struct_prefix.
        """

        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)

    def pread(self, offset, length):
        """
        Read a random-access region of raw data: `length` bytes starting at `offset`.
        """

        return self.buf[offset:offset + length]

    def pread_struct(self, offset, fmt):
        """
        Read structured data using the given struct format (without byte-order prefix) from the given offset.

        Returns the unpacked values as a tuple.
        """

        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)

    def pread_item(self, offset, fmt):
        """
        Read a single item of structured data from the given offset.

        The format must describe exactly one value.
        """

        value, = self.pread_struct(offset, fmt)

        return value

    def iter_offsets(self, count, size, offset=0):
        """
        Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
        """

        return xrange(offset, offset + count * size, size)

    def item_size(self, fmt):
        """
        Returns the size in bytes of the given item format, using this buffer's byte-order prefix.
        """

        return struct.calcsize(self.prefix + fmt)

    def unpack_item(self, fmt, data):
        """
        Unpacks a single item from the given data, using this buffer's byte-order prefix.
        """

        value, = struct.unpack(self.prefix + fmt, data)

        return value


def mmap_buffer(file, size):
    """
    Create and return a new read-only mmap'd region covering the first `size` bytes of the given file.

    A size of zero maps the whole file.
    """

    return mmap.mmap(file.fileno(), size, access=mmap.ACCESS_READ)


import exif_data


class Tag(object):
    """
    Represents a single Tag in an IFD.
    """

    def __init__(self, ifd, offset, tag, type, count, data_raw):
        """
        Build a Tag with the given binary items from the IFD entry.

            ifd         - the IFD this entry was read from
            offset      - offset of the entry (IFD offset plus the entry's position inside the IFD)
            tag         - numeric tag code
            type        - numeric field type code
            count       - number of values
            data_raw    - the raw 4-byte value/offset field of the entry
        """

        self.ifd = ifd
        self.offset = offset
        self.tag = tag
        self.type = type
        self.count = count
        self.data_raw = data_raw

        # lookup the type for this tag
        self.type_data = exif_data.FIELD_TYPES.get(type)

        # unpack it; the attributes always exist, so that unknown types do not cause AttributeErrors later
        if self.type_data:
            self.type_format, self.type_name, self.type_func = self.type_data
        else:
            self.type_format = self.type_name = self.type_func = None

        # lookup the tag data for this tag
        self.tag_data = self.ifd.tag_dict.get(tag)

    @property
    def name(self):
        """
        Lookup the name of this tag via its code, returns None if unknown.
        """

        if self.tag_data:
            return self.tag_data.name

        return None

    def is_subifd(self):
        """
        Tests if this Tag is of an IFDTag type, returning a bool.
        """

        return bool(self.tag_data and isinstance(self.tag_data, exif_data.IFDTag))

    @lazy_load
    def subifd(self):
        """
        Load the sub-IFD for this tag.

        Uses the tag's own `ifd_tags` dict if it has one, and the parent IFD's tag dict otherwise.
        """

        # the tag_dict to use
        tag_dict = self.tag_data.ifd_tags or self.ifd.tag_dict

        # construct, return
        return self.ifd.exif._load_subifd(self, tag_dict)

    def process_values(self, raw_values):
        """
        Process the given raw values unpacked from the file.

        Applies the field type's filter function, if there is one; otherwise the values are returned as-is.
        """

        if self.type_data and self.type_func:
            # use the filter func
            return self.type_func(raw_values)

        # nada, just leave them
        return raw_values

    def readable_value(self, values):
        """
        Convert the given values for this tag into a human-readable string.

        Returns the comma-separated values by default.
        """

        if self.tag_data:
            # map it
            return self.tag_data.map_values(values)

        # default value-mapping
        return ", ".join(str(value) for value in values)


# size of an IFD entry in bytes
IFD_ENTRY_SIZE = 12


class IFD(Buffer):
    """
    Represents an IFD (Image file directory) region in EXIF data.
    """

    def __init__(self, exif, buffer, tag_dict, **buffer_opts):
        """
        Access the IFD data from the given bufferable object with given buffer opts.

        This will read the `count` and `next_offset` values.
        """

        # init
        super(IFD, self).__init__(buffer, **buffer_opts)

        # store
        self.exif = exif
        self.tag_dict = tag_dict

        # read header
        self.count = self.pread_item(0, 'H')

        # read next-offset, which follows the last entry
        self.next_offset = self.pread_item(0x02 + self.count * IFD_ENTRY_SIZE, 'I')

    @lazy_load_iter
    def tags(self):
        """
        Iterate over all the Tag objects in this IFD.
        """

        # the entries follow the two-byte count
        for entry_offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02):
            # read the tag data
            tag_code, type_code, count, data_raw = self.pread_struct(entry_offset, 'HHI4s')

            # yield the new Tag, positioned relative to the buffer this IFD was cut from
            yield Tag(self, self.offset + entry_offset, tag_code, type_code, count, data_raw)

    def get_tags(self, filter=None):
        """
        Yield a series of tag objects for this IFD and all sub-IFDs.

        The `filter` argument is passed on to sub-IFDs, but is not otherwise applied.
        """

        for tag in self.tags:
            if tag.is_subifd():
                # recurse
                for subtag in tag.subifd.get_tags(filter=filter):
                    yield subtag

            else:
                # normal tag
                yield tag


class EXIF(Buffer):
    """
    Represents the EXIF data embedded in some image file in the form of a Region.
    """

    def __init__(self, buffer, **buffer_opts):
        """
        Access the EXIF data from the given bufferable object with the given buffer options.
        """

        # init Buffer
        super(EXIF, self).__init__(buffer, **buffer_opts)

        # store
        self.buffer = buffer

    @lazy_load_iter
    def ifds(self):
        """
        Iterate over the primary IFDs in this EXIF.
        """

        # starting offset
        offset = self.pread_item(0x04, 'I')

        # offsets already visited, so that a cyclic next-IFD chain in a corrupt file cannot loop forever
        seen = set()

        while offset and offset not in seen:
            seen.add(offset)

            # create and read the IFD, operating on the right sub-buffer with our byte order
            ifd = IFD(self, self.buf, exif_data.EXIF_TAGS, offset=offset, struct_prefix=self.prefix)

            # yield it
            yield ifd

            # skip to next offset
            offset = ifd.next_offset

    def _load_subifd(self, tag, tag_dict):
        """
        Creates and returns a sub-IFD for the given tag.

        Raises an Exception if the tag's field type is unknown, as the IFD offset cannot then be decoded.
        """

        # locate it
        values = self.tag_values_raw(tag)

        if values is None:
            raise Exception("Unable to read sub-IFD offset for tag %r: unknown field type %r" % (tag.tag, tag.type))

        offset, = values

        # construct the new IFD, with our byte order
        return IFD(self, self.buf, tag_dict, offset=offset, struct_prefix=self.prefix)

    def tag_data_info(self, tag):
        """
        Calculate the location, format and size of the given tag's data.

        Returns a (fmt, offset, size) tuple, or None if the tag's field type is unknown.
        """

        # unknown tag?
        if not tag.type_data:
            return None

        # data format
        if len(tag.type_format) == 1:
            # let struct handle the count
            fmt = "%d%s" % (tag.count, tag.type_format)

        else:
            # handle the count ourselves
            fmt = tag.type_format * tag.count

        # size of the data
        size = self.item_size(fmt)

        # inline or external?
        if size > 0x04:
            # point at the external data
            offset = self.unpack_item('I', tag.data_raw)

        else:
            # point at the inline data, which is the last four bytes of the entry
            offset = tag.offset + 0x08

        return fmt, offset, size

    def tag_values_raw(self, tag):
        """
        Get the raw values for the given tag as a tuple.

        Returns None if the tag could not be recognized.
        """

        # find the data
        data_info = self.tag_data_info(tag)

        # not found?
        if not data_info:
            return None

        # unpack
        data_fmt, data_offset, data_size = data_info

        # read values
        return self.pread_struct(data_offset, data_fmt)

    def tag_values(self, tag):
        """
        Gets the processed values for the given tag.

        Returns whatever Tag.process_values makes of the raw values; None stays None for unknown field types
        (those have no filter function).
        """

        # read + process
        return tag.process_values(self.tag_values_raw(tag))

    def tag_value(self, tag):
        """
        Return the human-readable string value for the given tag, or an empty string if it has no values.
        """

        # load the processed values
        values = self.tag_values(tag)

        # unknown?
        if not values:
            return ""

        # return as comma-separated formatted string, yes
        return tag.readable_value(values)

    def get_main_tags(self, **opts):
        """
        Get the tags for the main image's IFD as a dict of {name: readable value}.

        Any keyword options are passed on to IFD.get_tags. Tags without a known name all share the key None.

        Raises an Exception if there are no IFDs at all.
        """

        if not self.ifds:
            # weird case
            raise Exception("No IFD for main image found")

        # the main IFD is always the first one
        main_ifd = self.ifds[0]

        # do it
        return dict((tag.name, self.tag_value(tag)) for tag in main_ifd.get_tags(**opts))


# mapping from two-byte TIFF byte order marker to struct prefix
TIFF_BYTE_ORDER = {
    'II': '<',
    'MM': '>',
}

# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
TIFF_BYTEORDER_MAGIC = 42


def tiff_load(file, length=0, **opts):
    """
    Load the Exif/TIFF data from the given file at its current position with optional length.

    A length of zero means that the data extends to the end of the file. The byte order found in the TIFF
    header is used for the returned EXIF object, unless an explicit `struct_prefix` option is given.

    Raises KeyError for an unknown byte-order marker, and Exception if the TIFF magic number does not match.
    """

    # all Exif data offsets are relative to the beginning of this TIFF header
    offset = file.tell()

    # read byte-order header
    byte_order = file.read(2)

    # map to struct prefix
    struct_prefix = TIFF_BYTE_ORDER[byte_order]

    # validate
    check_value, = read_struct(file, struct_prefix + 'H')

    if check_value != TIFF_BYTEORDER_MAGIC:
        # %r, as the marker is a two-character string that %c cannot format
        raise Exception("Invalid byte-order for TIFF: %r -> %d" % (byte_order, check_value))

    # resolve the actual extent of the data
    file_size = os.fstat(file.fileno()).st_size

    if not length:
        length = file_size - offset

    # The mapping always starts at the beginning of the file, so it must also cover the `offset` bytes that
    # precede the TIFF header; clamp it to the file size so that a truncated file can still be mapped.
    buffer = mmap_buffer(file, min(offset + length, file_size))

    # decode using the byte order declared by the file
    opts.setdefault('struct_prefix', struct_prefix)

    # build and return the EXIF object with the correct offset/size from the mmap region
    return EXIF(buffer, offset=offset, size=length, **opts)


# the JPEG markers that don't have any data
JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)

# the first marker in a JPEG File
JPEG_START_MARKER = 0xD8

# the JPEG APP1 marker
# used for EXIF
JPEG_EXIF_MARKER = 0xE1

# the JPEG APP1 Exif header
JPEG_EXIF_HEADER = "Exif\x00\x00"

# the start-of-scan marker: entropy-coded image data follows its segment, not another marker
JPEG_SCAN_MARKER = 0xDA

# the end-of-image marker
JPEG_END_MARKER = 0xD9


def jpeg_markers(file):
    """
    Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.

    The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
    region, and may be seek'd around if needed.

    Iteration ends after the start-of-scan or end-of-image marker, or when the file runs out of data.

    Raises an Exception if something other than a marker is found, or if a marker has an invalid size field.

    XXX: find a real implementation of this somewhere?
    """

    while True:
        # read type
        try:
            marker_byte, marker_type = read_struct(file, '!BB')

        except struct.error:
            # out of data, so there are no more markers
            return

        # validate
        if marker_byte != 0xff:
            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))

        # special cases for no data
        if marker_type in JPEG_NOSIZE_MARKERS:
            size = 0

        else:
            # read size field
            size, = read_struct(file, '!H')

            # validate
            if size < 0x02:
                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))

            # do not count the size field itself
            size -= 2

        # ok, data is at current position; remember it, as the consumer may move the file around
        offset = file.tell()

        # yield
        yield marker_type, size

        # what follows these is not a marker segment, so walking on would only hit garbage
        if marker_type in (JPEG_SCAN_MARKER, JPEG_END_MARKER):
            return

        # absolute seek to next marker
        file.seek(offset + size)


def jpeg_find_exif(file):
    """
    Find the Exif/TIFF section in the given JPEG file.

    If found, the file will be seek'd to the start of the Exif/TIFF header, and the size of the Exif/TIFF data will
    be returned.

    Returns None if no EXIF section was found. Raises an Exception if the file does not start with the JPEG
    start marker.
    """

    for count, (marker, size) in enumerate(jpeg_markers(file)):
        # verify that it's a JPEG file
        if count == 0:
            # must start with the right marker
            if marker != JPEG_START_MARKER:
                raise Exception("JPEG file must start with 0xFF%02x marker, not 0xFF%02x" % (JPEG_START_MARKER, marker))

        # look for APP1 marker (0xE1) with EXIF signature
        elif marker == JPEG_EXIF_MARKER and file.read(len(JPEG_EXIF_HEADER)) == JPEG_EXIF_HEADER:
            # skipped the initial Exif marker signature
            return size - len(JPEG_EXIF_HEADER)

    # nothing
    return None


def jpeg_load(file, **opts):
    """
    Loads the embedded Exif TIFF data from the given JPEG file using tiff_load.

    Returns None if no EXIF data could be found.
    """

    # look for the right section
    size = jpeg_find_exif(file)

    # not found, or empty?
    if not size:
        return None

    # load it as TIFF data
    return tiff_load(file, size, **opts)


def load_path(path, **opts):
    """
    Loads an EXIF object from the given filesystem path.

    The loader is chosen by file extension (case-insensitive). Returns None if the extension is not recognized,
    or if the loader found no EXIF data.
    """

    # file extension
    root, fext = os.path.splitext(path)

    # map
    func = {
        '.jpeg':    jpeg_load,
        '.jpg':     jpeg_load,
        '.tiff':    tiff_load,  # XXX: untested
        '.tif':     tiff_load,  # XXX: untested
    }.get(fext.lower())

    # not recognized?
    if not func:
        # XXX: sniff the file
        return None

    # open it
    file = open(path, 'rb')

    try:
        # try and load it
        return func(file, **opts)

    finally:
        # the loaded data is accessed through its own memory mapping, so the file object is no longer needed
        file.close()


def dump_tag(exif, i, tag, indent=2):
    """
    Dump the given tag to stdout, recursing into its sub-IFD if it has one.

        exif    - the EXIF object used to locate and decode the tag's values
        i       - index of the tag, for display
        tag     - the Tag to dump
        indent  - number of tabs to indent the output by
    """

    tabs = '\t' * indent
    data_info = exif.tag_data_info(tag)

    if data_info:
        data_fmt, data_offset, data_size = data_info

        # pre-format the offset, as there is none to format for tags of an unknown type
        data_offset = "%#04x" % data_offset

    else:
        data_fmt = data_offset = data_size = None

    is_subifd = tag.is_subifd()

    print("%sTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%s, size=%s, is_subifd=%s:" % (
        tabs,
        i,
        tag.offset, tag.offset + exif.offset,
        tag.tag, tag.name or '???',
        tag.type, tag.type_name if tag.type_data else '???',
        tag.count,
        data_fmt, data_offset, data_size,
        is_subifd,
    ))

    if is_subifd:
        # recurse
        dump_ifd(exif, 0, tag.subifd, indent + 1)
        return

    values = exif.tag_values(tag)

    if values is None:
        # tags of an unknown type cannot be decoded
        print("%s\t(no values)" % (tabs, ))
        return

    # dump each value
    for index, value in enumerate(values):
        print("%s\t%02d: %.120r" % (tabs, index, value))

    # and then the readable one
    print("%s\t-> %.120s" % (tabs, tag.readable_value(values), ))


def dump_ifd(exif, i, ifd, indent=1):
    """
    Dump the given IFD to stdout, recursively.

        exif    - the EXIF object that the IFD belongs to
        i       - index of the IFD, for display
        ifd     - the IFD to dump
        indent  - number of tabs to indent the output by
    """

    print("%sIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
        '\t' * indent,
        i,
        ifd.offset, ifd.offset + exif.offset,
        ifd.count,
        ifd.next_offset
    ))

    for index, tag in enumerate(ifd.tags):
        # dump
        dump_tag(exif, index, tag, indent + 1)
def dump_exif(exif):
    """
    Dump all tags from the given EXIF object to stdout.
    """

    # the size is formatted with %s, as it is None for a buffer created without an explicit size
    print("EXIF offset=%#08x, size=%s:" % (exif.offset or 0, exif.size))

    for index, ifd in enumerate(exif.ifds):
        # dump
        dump_ifd(exif, index, ifd)


def list_tags(exif):
    """
    Print a neat listing of the main image's tags to stdout, one `name: value` line per tag, sorted by name.
    """

    tags = exif.get_main_tags()

    # sort on the string form of the name, as the name is None for unknown tags
    for name, value in sorted(tags.items(), key=lambda item: str(item[0])):
        print("%30s: %s" % (name, value))


def main_path(path, dump):
    """
    Load the EXIF data from the given path, and either dump or list it on stdout.

    Raises an Exception if no EXIF data could be loaded from the path.
    """

    # dump path
    print("%s: " % path)

    # try and load it
    exif = load_path(path)

    if not exif:
        raise Exception("No EXIF data found")

    if dump:
        # dump everything
        dump_exif(exif)

    else:
        # list them
        list_tags(exif)


def main(paths, dump=False):
    """
    Load and dump EXIF data from each of the given paths.

    With `dump`, the full IFD/tag structure is dumped, instead of just listing the main image's tags.
    """

    # handle each one
    for path in paths:
        main_path(path, dump=dump)


if __name__ == '__main__':
    import getopt
    import sys

    # defaults
    dump = False

    # parse args
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d", ["dump"])

    except getopt.GetoptError as error:
        # report bad options as a usage error instead of a traceback
        sys.stderr.write("%s\nusage: %s [-d|--dump] path [...]\n" % (error, sys.argv[0]))
        sys.exit(2)

    for opt, val in opts:
        if opt in ('-d', "--dump"):
            dump = True

    main(args, dump=dump)