degal/exif.py
author Tero Marttila <terom@fixme.fi>
Sat, 13 Jun 2009 20:31:51 +0300
branchnew-exif
changeset 104 6afe59e5ffae
parent 103 63e89dc2d6f1
child 105 effae6f38749
permissions -rw-r--r--
tidy up exif_data a bit
"""
    A custom EXIF parsing module, aimed at high performance.
"""

import struct, mmap, os

def read_struct (file, fmt) :
    """
        Read exactly as many bytes from `file` as the struct format `fmt` describes, and unpack them.

        Returns the tuple of unpacked values (a 1-tuple for a single-item format). A short read is not handled
        here; struct.unpack raises struct.error when given fewer bytes than the format requires.
    """

    # the format itself determines how much to read from the current file position
    raw = file.read(struct.calcsize(fmt))

    return struct.unpack(fmt, raw)

class Buffer (object) :
    """
        Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.
        
        Includes an offset for relative values, and an endianness for reading binary data.
    """
    
    def __init__ (self, obj, offset=None, size=None, struct_prefix='=') :
        """
            Create a new Buffer object with a new underlying buffer, created from the given object, offset and size.

            `offset` and `size` select the viewed region of `obj`; either may be left as None. They are stored
            as-is (including None) in `self.offset` and `self.size`.
            
            The endianness is given in the form of a struct-module prefix, which should be one of '<' or '>'.
            Standard size/alignment are assumed.
        """

        # buffer() takes its offset and size positionally, so a size can only be passed together with an
        # offset; when only a size is given, it must not end up in the offset slot
        if size is not None :
            self.buf = buffer(obj, offset or 0, size)

        elif offset is not None :
            self.buf = buffer(obj, offset)

        else :
            self.buf = buffer(obj)

        # store
        self.offset = offset
        self.size = size
        self.prefix = struct_prefix
    
    def subregion (self, offset, length=None) :
        """
            Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
            length, if any, and the same struct_prefix.
        """

        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)
    
    def pread (self, offset, length) :
        """
            Read a random-access region of raw data: `length` bytes starting at `offset` within this buffer.
        """

        return self.buf[offset:offset + length]
    
    def pread_struct (self, offset, fmt) :
        """
            Read structured data using the given struct format from the given offset.

            The buffer's byte-order prefix is prepended to `fmt`. Returns the tuple of unpacked values.
        """

        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)

    def pread_item (self, offset, fmt) :
        """
            Read a single item of structured data from the given offset.

            `fmt` must describe exactly one value; anything else fails the 1-tuple unpacking with a ValueError.
        """

        value, = self.pread_struct(offset, fmt)

        return value

    def iter_offsets (self, count, size, offset=0) :
        """
            Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
        """

        return xrange(offset, offset + count * size, size)
    
    def item_size (self, fmt) :
        """
            Returns the size in bytes of the given item format, using this buffer's byte-order prefix.
        """

        return struct.calcsize(self.prefix + fmt)

    def unpack_item (self, fmt, data) :
        """
            Unpacks a single item from the given data, using this buffer's byte-order prefix.
        """

        value, = struct.unpack(self.prefix + fmt, data)
        
        return value

def mmap_buffer (file, size) :
    """
        Memory-map `size` bytes of the given open file for read-only access, and return the mmap object.
    """

    # mmap works on the raw file descriptor, not on the file object
    fd = file.fileno()
    region = mmap.mmap(fd, size, access=mmap.ACCESS_READ)

    return region

import exif_data

class Tag (object) :
    """
        A single entry (tag) read from an IFD, together with whatever exif_data knows about its type and code.
    """

    def __init__ (self, offset, tag, type, count, data_raw) :
        """
            Build a Tag from the binary fields of an IFD entry.

            `offset` is the entry's offset, `tag` and `type` are the numeric codes, `count` is the number of
            values and `data_raw` is the entry's raw 4-byte data field.

            The type-specific attributes (type_format, type_name, type_func) and the tag-specific attributes
            (tag_name, tag_value_spec) are only set when exif_data has an entry for the respective code.
        """

        # raw fields, straight from the IFD entry
        self.offset, self.tag, self.type = offset, tag, type
        self.count, self.data_raw = count, data_raw

        # what do we know about this field type? None when the code is not listed
        type_info = exif_data.FIELD_TYPES.get(type)
        self.type_data = type_info

        if type_info :
            self.type_format, self.type_name, self.type_func = type_info

        # what do we know about this tag code? None when the code is not listed
        tag_info = exif_data.EXIF_TAGS.get(tag)
        self.tag_data = tag_info

        if tag_info :
            # the EXIF tag name, and the spec used for rendering its values
            self.tag_name, self.tag_value_spec = tag_info

    @property
    def name (self) :
        """
            The name of this tag as looked up by its code, or None if the code is unknown.
        """

        return self.tag_name if self.tag_data else None

    def process_values (self, raw_values) :
        """
            Run the raw values unpacked from the file through this tag's type filter function.

            The values are returned untouched if the type is unknown, or has no filter function.
        """

        if not (self.type_data and self.type_func) :
            # nothing to apply
            return raw_values

        return self.type_func(raw_values)

    def readable_value (self, values) :
        """
            Render the given values for this tag as a human-readable string.

            Uses the tag's value spec when the tag is known, and exif_data's handling of a None spec otherwise.
        """

        value_spec = self.tag_value_spec if self.tag_data else None

        return exif_data.map_values(value_spec, values)

# size of an IFD entry in bytes
# (this equals the size of the 'HHI4s' struct that IFD.iter_tags reads per entry: 2 + 2 + 4 + 4)
IFD_ENTRY_SIZE = 12

class IFD (Buffer) :
    """
        Represents an IFD (Image file directory) region in EXIF data.

        The region starts with a 2-byte entry count, followed by `count` entries of IFD_ENTRY_SIZE bytes each,
        followed by the 4-byte offset of the next IFD.
    """

    def __init__ (self, buffer, **buffer_opts) :
        """
            Access the IFD data from the given bufferable object with given buffer opts.

            This will read the `count` and `next_offset` values.
        """

        # init
        super(IFD, self).__init__(buffer, **buffer_opts)
        
        # read header
        self.count = self.pread_item(0, 'H')

        # read next-offset, which is stored right after the last entry
        self.next_offset = self.pread_item(0x02 + self.count * IFD_ENTRY_SIZE, 'I')
    
    def iter_tags (self) :
        """
            Iterate over all the Tag objects in this IFD.

            Each Tag's `offset` is expressed relative to the same base as this IFD's own `offset` (i.e. the
            buffer that the IFD was created from), not relative to the start of the IFD. That is the base
            against which the tag's inline data is later located and read.
        """

        # where this IFD itself starts; an IFD created without an offset starts at zero
        base = self.offset or 0
        
        # read each tag; the entry offsets used for reading are relative to the start of this IFD
        for offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02) :
            # read the tag data
            tag, type, count, data_raw = self.pread_struct(offset, 'HHI4s')
            
            # yield the new Tag, positioned relative to the parent buffer
            yield Tag(base + offset, tag, type, count, data_raw)

class EXIF (Buffer) :
    """
        Represents the EXIF data embedded in some image file, in the form of a Buffer over the Exif/TIFF region.
    """

    def __init__ (self, buffer, tags=None, **buffer_opts) :
        """
            Access the EXIF data from the given bufferable object with the given buffer options.

            `tags` is accepted for selecting which named tags should be loaded, but is currently not used.
        """

        # init Buffer
        super(EXIF, self).__init__(buffer, **buffer_opts)

        # store
        self.buffer = buffer
    
    def iter_ifds (self) :
        """
            Iterate over the primary IFDs in this EXIF.

            The chain starts at the offset stored at 0x04, and follows each IFD's next_offset until it is zero.
            Iteration also stops if the chain leads back to an IFD that was already visited, so that a corrupt
            file cannot cause an endless loop.
        """

        # starting offset
        offset = self.pread_item(0x04, 'I')

        # offsets already visited, to detect cyclic chains
        seen = set()

        while offset and offset not in seen :
            seen.add(offset)

            # create and read the IFD, operating on the right sub-buffer
            ifd = IFD(self.buf, offset=offset)

            # yield it
            yield ifd

            # skip to next offset
            offset = ifd.next_offset
    
    def iter_all_ifds (self) :
        """
            Iterate over all of the IFDs contained within this EXIF, or within other IFDs.

            XXX: not implemented; this has no body and returns None.
        """

    __iter__ = iter_ifds
    
    def tag_data_info (self, tag) :
        """
            Calculate the location, format and size of the given tag's data.

            Returns a (fmt, offset, size) tuple, or None if the tag's type is unknown.
        """
        # unknown type?
        if not tag.type_data :
            return None

        # data format
        if len(tag.type_format) == 1 :
            # let struct handle the count
            fmt = "%d%s" % (tag.count, tag.type_format)

        else :
            # handle the count ourselves
            fmt = tag.type_format * tag.count

        # size of the data
        size = self.item_size(fmt)

        # inline or external?
        if size > 0x04 :
            # too big for the entry's 4-byte data field, which holds the offset of the external data instead
            offset = self.unpack_item('I', tag.data_raw)

        else :
            # point at the inline data, which is the last 4 bytes of the entry
            offset = tag.offset + 0x08
        
        return fmt, offset, size

    def tag_values_raw (self, tag) :
        """
            Get the raw values for the given tag as a tuple.

            Returns None if the tag could not be recognized.
        """

        # find the data
        data_info = self.tag_data_info(tag)

        # not found?
        if not data_info :
            return None
        
        # unpack
        data_fmt, data_offset, data_size = data_info
        
        # read values
        return self.pread_struct(data_offset, data_fmt)
    
    def tag_values (self, tag) :
        """
            Gets the processed values for the given tag.

            This is whatever the tag's process_values returns for the raw values, which is None for a tag
            with an unknown type.
        """

        # read + process
        return tag.process_values(self.tag_values_raw(tag))

    def tag_value (self, tag) :
        """
            Return the human-readable string value for the given tag.

            Returns an empty string if the tag has no values.
        """
        
        # load the processed values
        values = self.tag_values(tag)

        # unknown?
        if not values :
            return ""

        # format them as the tag's human-readable string
        return tag.readable_value(values)

# mapping from two-byte TIFF byte order marker to struct prefix
# ('<' is the struct module's little-endian prefix, '>' its big-endian prefix)
TIFF_BYTE_ORDER = {
    'II': '<',
    'MM': '>',
}

# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
# tiff_load reads this value right after the byte order marker, using the selected byte order
TIFF_BYTEORDER_MAGIC = 42 

def tiff_load (file, length=0, **opts) :
    """
        Load the Exif/TIFF data from the given file at its current position, and return it as an EXIF object.

        `length` is the size of the Exif/TIFF data in bytes; if zero (the default), the data is taken to run
        from the current position to the end of the file. Any further keyword arguments are passed on to EXIF.

        Raises KeyError if the data does not start with a known byte-order marker, and Exception if the magic
        number following it does not match.
    """

    # all Exif data offsets are relative to the beginning of this TIFF header
    offset = file.tell()

    if not length :
        # no explicit length, so use everything from here up to the end of the file
        length = os.fstat(file.fileno()).st_size - offset

    # mmap the file up to the end of the EXIF data; the mapping always starts at the beginning of the file,
    # so it must cover the `offset` bytes in front of the data as well
    buffer = mmap_buffer(file, offset + length)

    # read byte-order header
    byte_order = file.read(2)

    # map to struct prefix
    struct_prefix = TIFF_BYTE_ORDER[byte_order]

    # validate
    check_value, = read_struct(file, struct_prefix + 'H')

    if check_value != TIFF_BYTEORDER_MAGIC  :
        raise Exception("Invalid byte-order for TIFF: %r -> %d" % (byte_order, check_value))

    # build and return the EXIF object with the correct offset/size from the mmap region
    return EXIF(buffer, offset=offset, size=length, **opts)

# the JPEG markers that don't have any data
JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)

# the first marker in a JPEG File
JPEG_START_MARKER = 0xD8

# the JPEG APP1 marker used for EXIF
JPEG_EXIF_MARKER = 0xE1

# the JPEG APP1 Exif header
JPEG_EXIF_HEADER = "Exif\x00\x00"

# the JPEG markers after which there are no further marker segments to walk: EOI (0xD9) ends the image, and
# SOS (0xDA) is followed by compressed scan data that cannot be skipped using a size field
JPEG_LAST_MARKERS = (0xD9, 0xDA)

def jpeg_markers (file) :
    """
        Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.

        The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
        region, and may be seek'd around if needed.

        Iteration ends at the end of the file, or after yielding an EOI or SOS marker. Raises Exception for data
        that is not a valid marker, and struct.error for a marker that is cut short by the end of the file.
        
        XXX: find a real implementation of this somewhere?
    """

    while True :
        # read type
        header = file.read(2)

        if not header :
            # clean end of file, no more markers
            return

        marker_byte, marker_type = struct.unpack('!BB', header)
        
        # validate
        if marker_byte != 0xff :
            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))

        # special cases for no data
        if marker_type in JPEG_NOSIZE_MARKERS :
            size = 0

        else :
            # read size field
            size, = struct.unpack('!H', file.read(2))
            
            # validate
            if size < 0x02 :
                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))
            
            # do not count the size field itself
            size = size - 2
            
        # ok, data is at current position
        offset = file.tell()
        
        # yield
        yield marker_type, size

        if marker_type in JPEG_LAST_MARKERS :
            # what follows is not a series of markers anymore
            return

        # absolute seek to next marker, as the caller may have moved the file position
        file.seek(offset + size)

def jpeg_find_exif (file) :
    """
        Find the Exif/TIFF section in the given JPEG file.

        If found, the file will be seek'd to the start of the Exif/TIFF header, and the size of the Exif/TIFF data will
        be returned.
        
        Returns None if no EXIF section was found.

        Raises Exception if the first marker in the file is not the JPEG start marker.
    """

    header_size = len(JPEG_EXIF_HEADER)

    for count, (marker, size) in enumerate(jpeg_markers(file)) :
        # verify that it's a JPEG file
        if count == 0 :
            # must start with the right marker
            if marker != JPEG_START_MARKER :
                raise Exception("JPEG file must start with 0xFF%02x marker, found 0xFF%02x" % (JPEG_START_MARKER, marker))

        # look for APP1 marker (0xE1) with EXIF signature
        elif marker == JPEG_EXIF_MARKER and file.read(header_size) == JPEG_EXIF_HEADER :
            # the signature was just read, so the file is now positioned at the Exif/TIFF header itself
            return size - header_size

    # nothing
    return None

def jpeg_load (file, **opts) :
    """
        Locate the Exif section embedded in the given JPEG file, and load it using tiff_load.

        Any keyword arguments are passed on to tiff_load. Returns None if no EXIF data could be found (which
        includes an Exif section with a size of zero).
    """

    # this leaves the file positioned at the start of the Exif/TIFF data
    exif_size = jpeg_find_exif(file)

    if exif_size :
        # load it as TIFF data
        return tiff_load(file, exif_size, **opts)

    # not found
    return None

def load_path (path, **opts) :
    """
        Loads an EXIF object from the given filesystem path.

        The loader is chosen by the (case-insensitive) file extension; any keyword arguments are passed on
        to it.

        Returns None if the file extension is not recognized, or if the loader found no EXIF data. Errors
        raised by the loader for malformed files are not caught here.
    """
    
    # file extension
    root, fext = os.path.splitext(path)

    # map
    func = {
        '.jpeg':    jpeg_load,
        '.jpg':     jpeg_load,
        '.tiff':    tiff_load,  # XXX: untested
    }.get(fext.lower())
    
    # not recognized?
    if not func :
        # XXX: sniff the file
        return None

    # open it
    file = open(path, 'rb')

    try :
        # try and load it
        return func(file, **opts)

    finally :
        # the file is only needed while loading; a memory mapping stays valid after its file is closed
        file.close()

def dump_exif (exif) :
    """
        Dump all tags from the given EXIF object to stdout.

        Prints one line per IFD, one line per tag, and then the tag's values followed by its human-readable
        form. Tags with an unknown type have no data info or values, and are shown using '???' instead.
    """

    print("EXIF offset=%#08x, size=%d:" % (exif.offset, exif.size))

    for ifd_index, ifd in enumerate(exif.iter_ifds()) :
        print("\tIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
            ifd_index, 
            ifd.offset, ifd.offset + exif.offset,
            ifd.count, 
            ifd.next_offset
        ))
        
        for tag_index, tag in enumerate(ifd.iter_tags()) :
            data_info = exif.tag_data_info(tag)

            if data_info :
                data_fmt, data_offset, data_size = data_info

                data_offset_text = "%#04x" % data_offset

            else :
                # unknown type, so there is no offset that could be formatted as a number
                data_fmt = data_size = None

                data_offset_text = '???'

            print("\t\tTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%s, size=%s:" % (
                tag_index, 
                tag.offset, tag.offset + exif.offset,
                tag.tag, tag.name or '???',
                tag.type, tag.type_name if tag.type_data else '???',
                tag.count,
                data_fmt, data_offset_text, data_size,
            ))

            values = exif.tag_values(tag)

            if values is None :
                # unknown type, there is nothing to list or to format
                print("\t\t\t->  ???")

                continue
            
            for value_index, value in enumerate(values) :
                print("\t\t\t%02d: %r" % (value_index, value))

            print("\t\t\t->  %s" % (tag.readable_value(values), ))

def main (path, quiet=False) :
    """
        Load the EXIF data from the file at `path`, and dump it to stdout unless `quiet` is set.

        Raises Exception if no EXIF data could be loaded.
    """

    exif = load_path(path)

    if not exif :
        raise Exception("No EXIF data found")

    if quiet :
        # loading it was all that was asked for
        return

    # header line for the file, followed by an empty line
    print("%s: " % path)
    print("")

    dump_exif(exif)

# command-line entry point: the first argument is the path of the file to dump, and a '-q' anywhere on the
# command line selects quiet mode (load the file, but do not dump it)
if __name__ == '__main__' :
    from sys import argv

    main(argv[1], '-q' in argv)