# degal/lib/exif.py
"""
    A custom EXIF parsing module, aimed at high performance.

    The EXIF data is accessed through read-only buffer views on top of an mmap'd region of the image file; values
    are only unpacked from the buffer when they are asked for.
"""

import struct
import mmap
import os

from utils import lazy_load, lazy_load_iter

def read_struct (file, fmt) :
    """
        Utility function to read data from a file using struct.

        Reads exactly struct.calcsize(fmt) bytes from the file's current position and returns the tuple of
        unpacked values. struct.unpack raises struct.error if the file yields fewer bytes than that.
    """

    # length of data
    fmt_size = struct.calcsize(fmt)

    # get data
    file_data = file.read(fmt_size)

    # unpack, this raises an error if file_data is too short
    return struct.unpack(fmt, file_data)

class Buffer (object) :
    """
        Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.

        Includes an offset for relative values, and an endianness for reading binary data.
    """

    def __init__ (self, obj, offset=None, size=None, struct_prefix='=') :
        """
            Create a new Buffer object with a new underlying buffer, created from the given object, offset and size.

            A missing offset means the start of the object, a missing size means "up to the end of the object".

            The endianness is given in the form of a struct-module prefix, which should be one of '<' or '>'.
            Standard size/alignment are assumed.
        """

        # buffer() takes (object, offset, size) positionally, so the offset must be given explicitly whenever a
        # size is given - otherwise the size would be interpreted as the offset
        if offset is None :
            offset = 0

        if size is None :
            self.buf = buffer(obj, offset)

        else :
            self.buf = buffer(obj, offset, size)

        # store; offset/size are always kept as integers so that they can be used in arithmetic and formatting
        self.offset = offset
        self.size = size if size is not None else len(self.buf)
        self.prefix = struct_prefix

    def subregion (self, offset, length=None) :
        """
            Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
            length, if any, and the same struct_prefix.
        """

        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)

    def pread (self, offset, length) :
        """
            Read a random-access region of raw data: `length` bytes starting at `offset`.
        """

        return self.buf[offset:offset + length]

    def pread_struct (self, offset, fmt) :
        """
            Read structured data using the given struct format from the given offset.

            The format is given without a byte-order prefix; this buffer's prefix is added. Returns a tuple.
        """

        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)

    def pread_item (self, offset, fmt) :
        """
            Read a single item of structured data from the given offset.

            The format must describe exactly one value, otherwise the unpacking raises ValueError.
        """

        value, = self.pread_struct(offset, fmt)

        return value

    def iter_offsets (self, count, size, offset=0) :
        """
            Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
        """

        return xrange(offset, offset + count * size, size)

    def item_size (self, fmt) :
        """
            Returns the size in bytes of the given item format, using this buffer's byte-order prefix.
        """

        return struct.calcsize(self.prefix + fmt)

    def unpack_item (self, fmt, data) :
        """
            Unpacks a single item from the given data, using this buffer's byte-order prefix.
        """

        value, = struct.unpack(self.prefix + fmt, data)

        return value

def mmap_buffer (file, size) :
    """
        Create and return a new read-only mmap'd region covering the first `size` bytes of the given file.

        The size is handed to mmap.mmap as-is; mmap documents a length of zero as "map the whole file".
    """

    return mmap.mmap(file.fileno(), size, access=mmap.ACCESS_READ)

# NOTE(review): this import sits below the Buffer definitions in the original as well; its position is kept in case
# exif_data depends on it - confirm before moving it to the top of the file
import exif_data

class Tag (object) :
    """
        Represents a single Tag in an IFD
    """

    def __init__ (self, ifd, offset, tag, type, count, data_raw) :
        """
            Build a Tag with the given binary items from the IFD entry.

                ifd         - the IFD object that this tag was read from
                offset      - offset of the 12-byte IFD entry
                tag         - numeric tag code
                type        - numeric field type code, looked up in exif_data.FIELD_TYPES
                count       - number of values of the given type
                data_raw    - the raw 4-byte value/offset field of the entry
        """

        self.ifd = ifd
        self.offset = offset
        self.tag = tag
        self.type = type
        self.count = count
        self.data_raw = data_raw

        # lookup the type for this tag
        self.type_data = exif_data.FIELD_TYPES.get(type)

        # unpack it
        if self.type_data :
            self.type_format, self.type_name, self.type_func = self.type_data

        else :
            # unknown type; define the attributes anyway so that they can always be accessed
            self.type_format = self.type_name = self.type_func = None

        # lookup the tag data for this tag
        self.tag_data = self.ifd.tag_dict.get(tag)

    @property
    def name (self) :
        """
            Lookup the name of this tag via its code, returns None if unknown.
        """

        if self.tag_data :
            return self.tag_data.name

        else :
            return None

    def is_subifd (self) :
        """
            Tests if this Tag is of a IFDTag type. Always returns a bool.
        """

        # an unknown tag has tag_data=None, which is not an IFDTag instance either
        return isinstance(self.tag_data, exif_data.IFDTag)

    @lazy_load
    def subifd (self) :
        """
            Load the sub-IFD for this tag.

            Only meaningful when is_subifd() is true; uses the tag's own ifd_tags dict if it has one, and the tag
            dict of the containing IFD otherwise.
        """

        # the tag_dict to use
        tag_dict = self.tag_data.ifd_tags or self.ifd.tag_dict

        # construct, return
        return self.ifd.exif._load_subifd(self, tag_dict)

    def process_values (self, raw_values) :
        """
            Process the given raw values unpacked from the file.

            Applies the field type's filter function if there is one, otherwise the values are returned as-is.
        """

        if self.type_data and self.type_func :
            # use the filter func
            return self.type_func(raw_values)

        else :
            # nada, just leave them
            return raw_values

    def readable_value (self, values) :
        """
            Convert the given values for this tag into a human-readable string.

            Returns the comma-separated values by default.
        """

        if self.tag_data :
            # map it
            return self.tag_data.map_values(values)

        else :
            # default value-mapping
            return ", ".join(str(value) for value in values)

# size of an IFD entry in bytes
IFD_ENTRY_SIZE = 12

class IFD (Buffer) :
    """
        Represents an IFD (Image file directory) region in EXIF data.
    """

    def __init__ (self, exif, buffer, tag_dict, **buffer_opts) :
        """
            Access the IFD data from the given bufferable object with given buffer opts.

            This will read the `count` and `next_offset` values.
        """

        # init
        super(IFD, self).__init__(buffer, **buffer_opts)

        # store
        self.exif = exif
        self.tag_dict = tag_dict

        # read header: the number of entries
        self.count = self.pread_item(0, 'H')

        # read next-offset, which follows directly after the last entry
        self.next_offset = self.pread_item(0x02 + self.count * IFD_ENTRY_SIZE, 'I')

    @lazy_load_iter
    def tags (self) :
        """
            Iterate over all the Tag objects in this IFD
        """

        # read each tag; the entries start after the two-byte count field
        for offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02) :
            # read the tag data
            tag, type, count, data_raw = self.pread_struct(offset, 'HHI4s')

            # yield the new Tag, with its offset relative to the buffer this IFD was created from
            yield Tag(self, self.offset + offset, tag, type, count, data_raw)

    def get_tags (self, filter=None) :
        """
            Yield a series of tag objects for this IFD and all sub-IFDs.

            The `filter` argument is passed down to the sub-IFDs, but is not otherwise used yet.
        """

        for tag in self.tags :
            if tag.is_subifd() :
                # recurse
                for subtag in tag.subifd.get_tags(filter=filter) :
                    yield subtag

            else :
                # normal tag
                yield tag

class EXIF (Buffer) :
    """
        Represents the EXIF data embedded in some image file in the form of a Region.
    """

    def __init__ (self, buffer, **buffer_opts) :
        """
            Access the EXIF data from the given bufferable object with the given buffer options.
        """

        # init Buffer
        super(EXIF, self).__init__(buffer, **buffer_opts)

        # store
        self.buffer = buffer

    @lazy_load_iter
    def ifds (self) :
        """
            Iterate over the primary IFDs in this EXIF.
        """

        # starting offset
        offset = self.pread_item(0x04, 'I')

        while offset :
            # create and read the IFD, operating on the right sub-buffer; the IFD must use our byte order
            ifd = IFD(self, self.buf, exif_data.EXIF_TAGS, offset=offset, struct_prefix=self.prefix)

            # yield it
            yield ifd

            # skip to next offset
            offset = ifd.next_offset

    def _load_subifd (self, tag, tag_dict) :
        """
            Creates and returns a sub-IFD for the given tag.

            The tag's single value is used as the offset of the sub-IFD. Unpacking that value fails if the tag's
            type is unknown (tag_values_raw returns None) or if it holds more than one value.
        """

        # locate it
        offset, = self.tag_values_raw(tag)

        # construct the new IFD, using the same byte order as we do
        return IFD(self, self.buf, tag_dict, offset=offset, struct_prefix=self.prefix)

    def tag_data_info (self, tag) :
        """
            Calculate the location, format and size of the given tag's data.

            Returns a (fmt, offset, size) tuple, or None if the tag's type is unknown.
        """
        # unknown tag?
        if not tag.type_data :
            return None

        # data format
        if len(tag.type_format) == 1 :
            # let struct handle the count
            fmt = "%d%s" % (tag.count, tag.type_format)

        else :
            # handle the count ourselves
            fmt = tag.type_format * tag.count

        # size of the data
        size = self.item_size(fmt)

        # inline or external?
        if size > 0x04 :
            # does not fit into the entry's 4-byte value field, which then holds the offset of the external data
            offset = self.unpack_item('I', tag.data_raw)

        else :
            # point at the inline data, which is the value field at the end of the entry itself
            offset = tag.offset + 0x08

        return fmt, offset, size

    def tag_values_raw (self, tag) :
        """
            Get the raw values for the given tag as a tuple.

            Returns None if the tag could not be recognized.
        """

        # find the data
        data_info = self.tag_data_info(tag)

        # not found?
        if not data_info :
            return None

        # unpack
        data_fmt, data_offset, data_size = data_info

        # read values
        return self.pread_struct(data_offset, data_fmt)

    def tag_values (self, tag) :
        """
            Gets the processed values for the given tag.

            Returns None if the tag could not be recognized.
        """

        # read + process
        return tag.process_values(self.tag_values_raw(tag))

    def tag_value (self, tag) :
        """
            Return the human-readable string value for the given tag.

            Returns an empty string if the tag has no values, or they could not be recognized.
        """

        # load the raw values
        values = self.tag_values(tag)

        # unknown?
        if not values :
            return ""

        # return as comma-separated formatted string, yes
        return tag.readable_value(values)

    def get_main_tags (self, **opts) :
        """
            Get the tags for the main image's IFD as a {name: readable value} dict.

            Any keyword options are passed on to IFD.get_tags.
        """

        if not self.ifds :
            # weird case
            raise Exception("No IFD for main image found")

        # the main IFD is always the first one
        main_ifd = self.ifds[0]

        # do it
        return dict((tag.name, self.tag_value(tag)) for tag in main_ifd.get_tags(**opts))

# mapping from two-byte TIFF byte order marker to struct prefix
TIFF_BYTE_ORDER = {
    'II': '<',
    'MM': '>',
}

# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
TIFF_BYTEORDER_MAGIC = 42

def tiff_load (file, length=0, **opts) :
    """
        Load the Exif/TIFF data from the given file at its current position with optional length.

        A length of zero means "up to the end of the file". Any other keyword options are passed on to EXIF; the
        byte order detected from the TIFF header is used as the `struct_prefix`, unless one is given explicitly.

        Returns the new EXIF object. Raises KeyError for an unknown byte-order marker, and Exception if the magic
        number following it does not match.
    """

    # all Exif data offsets are relative to the beginning of this TIFF header
    offset = file.tell()

    # mmap the region for the EXIF data. The mapping always begins at the start of the file, so it must extend up to
    # the end of the data region at offset + length; mmap treats a zero size as "the whole file".
    if length :
        region = mmap_buffer(file, offset + length)

    else :
        region = mmap_buffer(file, 0)

    # read byte-order header
    byte_order = file.read(2)

    # map to struct prefix
    struct_prefix = TIFF_BYTE_ORDER[byte_order]

    # validate
    check_value, = read_struct(file, struct_prefix + 'H')

    if check_value != TIFF_BYTEORDER_MAGIC :
        raise Exception("Invalid byte-order for TIFF: %s -> %d" % (byte_order, check_value))

    # the EXIF data must be read using the byte order declared by its header
    opts.setdefault('struct_prefix', struct_prefix)

    # build and return the EXIF object with the correct offset/size from the mmap region
    return EXIF(region, offset=offset, size=length or None, **opts)

# the JPEG markers that don't have any data
JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)

# the first marker in a JPEG File
JPEG_START_MARKER = 0xD8

# the markers after which no further marker segments can be read: the start-of-scan header is followed by the
# entropy-coded image data, and the end-of-image marker ends the file
JPEG_LAST_MARKERS = (0xDA, 0xD9)

# the JPEG APP1 marker used for EXIF
JPEG_EXIF_MARKER = 0xE1

# the JPEG APP1 Exif header
JPEG_EXIF_HEADER = "Exif\x00\x00"

def jpeg_markers (file) :
    """
        Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.

        The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
        region, and may be seek'd around if needed.

        Iteration ends after the start-of-scan (0xDA) or end-of-image (0xD9) marker has been yielded, as what follows
        is not a marker segment. Raises Exception on malformed markers, and struct.error if the file ends early.

        XXX: find a real implementation of this somewhere?
    """

    while True :
        # read type
        marker_byte, marker_type = read_struct(file, '!BB')

        # validate
        if marker_byte != 0xff :
            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))

        # special cases for no data
        if marker_type in JPEG_NOSIZE_MARKERS :
            size = 0

        else :
            # read size field
            size, = read_struct(file, '!H')

            # validate
            if size < 0x02 :
                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))

            # do not count the size field itself
            size = size - 2

        # ok, data is at current position
        offset = file.tell()

        # yield
        yield marker_type, size

        # nothing but image data or end-of-file beyond this point
        if marker_type in JPEG_LAST_MARKERS :
            return

        # absolute seek to next marker, as the consumer may have moved the file position
        file.seek(offset + size)

def jpeg_find_exif (file) :
    """
        Find the Exif/TIFF section in the given JPEG file.

        If found, the file will be seek'd to the start of the Exif/TIFF header, and the size of the Exif/TIFF data will
        be returned.

        Returns None if no EXIF section was found.
    """

    for count, (marker, size) in enumerate(jpeg_markers(file)) :
        # verify that it's a JPEG file
        if count == 0 :
            # must start with the right marker
            if marker != JPEG_START_MARKER :
                raise Exception("JPEG file must start with 0xFF%02x marker" % (marker, ))

        # look for APP1 marker (0xE1) with EXIF signature
        elif marker == JPEG_EXIF_MARKER and file.read(len(JPEG_EXIF_HEADER)) == JPEG_EXIF_HEADER:
            # skipped the initial Exif marker signature
            return size - len(JPEG_EXIF_HEADER)

    # nothing
    return None

def jpeg_load (file, **opts) :
    """
        Loads the embedded Exif TIFF data from the given JPEG file using tiff_load.

        Returns None if no EXIF data could be found.
    """

    # look for the right section
    size = jpeg_find_exif(file)

    # not found?
    if not size :
        # nothing
        return

    else :
        # load it as TIFF data
        return tiff_load(file, size, **opts)

def load_path (path, **opts) :
    """
        Loads an EXIF object from the given filesystem path.

        The loader is chosen by file extension (.jpeg/.jpg/.tiff); keyword options are passed on to it.

        Returns None if it could not be parsed.
    """

    # file extension
    root, fext = os.path.splitext(path)

    # map
    func = {
        '.jpeg':    jpeg_load,
        '.jpg':     jpeg_load,
        '.tiff':    tiff_load,   # XXX: untested
    }.get(fext.lower())

    # not recognized?
    if not func :
        # XXX: sniff the file
        return None

    # open it
    file = open(path, 'rb')

    try :
        # try and load it
        return func(file, **opts)

    finally :
        # the mmap'd region remains valid after the file itself has been closed
        file.close()

def dump_tag (exif, i, tag, indent=2) :
    """
        Dump the given tag to stdout, including its sub-IFD or its values.

        Tags of an unknown type are dumped without values.
    """

    data_info = exif.tag_data_info(tag)

    if data_info :
        data_fmt, data_offset, data_size = data_info

        data_offset_str = "%#04x" % (data_offset, )

    else :
        # unknown type, there is no offset to format as hex
        data_fmt = data_size = None

        data_offset_str = "None"

    print("%sTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%s, size=%s, is_subifd=%s:" % (
        '\t'*indent,
        i,
        tag.offset, tag.offset + exif.offset,
        tag.tag, tag.name or '???',
        tag.type, tag.type_name if tag.type_data else '???',
        tag.count,
        data_fmt, data_offset_str, data_size,
        tag.is_subifd(),
    ))

    if tag.is_subifd() :
        # recurse
        dump_ifd(exif, 0, tag.subifd, indent + 1)

        return

    values = exif.tag_values(tag)

    if values is None :
        # unknown type, nothing that could be shown
        return

    # dump each value
    for index, value in enumerate(values) :
        print("%s\t%02d: %.120r" % ('\t'*indent, index, value))

    # and then the readable one
    print("%s\t->  %.120s" % ('\t'*indent, tag.readable_value(values), ))


def dump_ifd (exif, i, ifd, indent=1) :
    """
        Dump the given IFD to stdout, recursively
    """

    print("%sIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
        '\t'*indent,
        i,
        ifd.offset, ifd.offset + exif.offset,
        ifd.count,
        ifd.next_offset
    ))

    for tag_index, tag in enumerate(ifd.tags) :
        # dump
        dump_tag(exif, tag_index, tag, indent + 1)


def dump_exif (exif) :
    """
        Dump all tags from the given EXIF object to stdout
    """

    print("EXIF offset=%#08x, size=%d:" % (exif.offset, exif.size))

    for i, ifd in enumerate(exif.ifds) :
        # dump
        dump_ifd(exif, i, ifd)


def list_tags (exif) :
    """
        Print a neat listing of the main image's tags to stdout
    """

    for k, v in exif.get_main_tags().iteritems() :
        print("%30s: %s" % (k, v))

def main_path (path, dump) :
    """
        Load the EXIF data from the given path, and either dump or list it to stdout.

        Raises Exception if no EXIF data could be loaded.
    """

    # dump path
    print("%s: " % path)

    # try and load it
    exif = load_path(path)

    if not exif :
        raise Exception("No EXIF data found")

    if dump :
        # dump everything
        dump_exif(exif)

    else :
        # list them
        list_tags(exif)


def main (paths, dump=False) :
    """
        Load and dump EXIF data from each of the given paths
    """

    # handle each one
    for path in paths :
        main_path(path, dump=dump)

if __name__ == '__main__' :
    import getopt
    from sys import argv

    # defaults
    dump = False

    # parse args
    opts, args = getopt.getopt(argv[1:], "d", ["dump"])

    for opt, val in opts :
        if opt in ('-d', "--dump") :
            dump = True

    main(args, dump=dump)