degal/exif.py
branch new-exif
changeset 102 ef2c1ffdca8f
child 103 63e89dc2d6f1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/degal/exif.py	Sat Jun 13 18:34:55 2009 +0300
@@ -0,0 +1,491 @@
+"""
+    A custom EXIF parsing module, aimed at high performance.
+"""
+
+import struct, mmap, os
+
+def read_struct (file, fmt) :
+    """
+        Read one struct-formatted record from the file's current position and return the unpacked fields
+        as a tuple.
+
+        Exactly struct.calcsize(fmt) bytes are requested from the file. The length of what was actually read
+        is not checked here; struct.unpack rejects data of the wrong length.
+    """
+
+    # ask the file for as many bytes as the format describes
+    raw = file.read(struct.calcsize(fmt))
+
+    # a truncated read surfaces as an error from unpack
+    return struct.unpack(fmt, raw)
+
+class Buffer (object) :
+    """
+        Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.
+
+        Carries the offset of the wrapped region within the underlying object, the size of the region, and the
+        struct-module byte-order prefix used when decoding binary values from it.
+    """
+
+    def __init__ (self, obj, offset=None, size=None, struct_prefix='=') :
+        """
+            Create a new Buffer object with a new underlying buffer, created from the given object, offset and size.
+
+            `offset` defaults to the start of `obj`, and `size` to everything from `offset` up to the end of `obj`.
+
+            The endianness is given in the form of a struct-module prefix, which should be one of '<' or '>'.
+            Standard size/alignment are assumed.
+        """
+
+        # buffer() only accepts integers, so the None defaults must be resolved before calling it
+        if offset is None :
+            offset = 0
+
+        if size is None :
+            # no explicit size: view everything from offset onwards, and record how much that is
+            self.buf = buffer(obj, offset)
+            size = len(self.buf)
+
+        else :
+            self.buf = buffer(obj, offset, size)
+
+        # store
+        self.offset = offset
+        self.size = size
+        self.prefix = struct_prefix
+
+    def subregion (self, offset, length=None) :
+        """
+            Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
+            length, if any, and the same struct_prefix.
+
+            Without a length, the sub-Buffer extends to the end of this buffer.
+        """
+
+        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)
+
+    def pread (self, offset, length) :
+        """
+            Read a random-access region of raw data.
+
+            This is a plain slice of the buffer, so a region reaching past the end yields fewer than `length` bytes.
+        """
+
+        return self.buf[offset:offset + length]
+
+    def pread_struct (self, offset, fmt) :
+        """
+            Read structured data using the given struct format from the given offset.
+
+            The buffer's byte-order prefix is prepended to `fmt`. Returns the tuple of unpacked fields.
+        """
+
+        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)
+
+    def pread_item (self, offset, fmt) :
+        """
+            Read a single item of structured data from the given offset.
+
+            `fmt` must describe exactly one field, as the result is unpacked into a single value.
+        """
+
+        value, = self.pread_struct(offset, fmt)
+
+        return value
+
+    def iter_offsets (self, count, size, offset=0) :
+        """
+            Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
+        """
+
+        return xrange(offset, offset + count * size, size)
+
+    def item_size (self, fmt) :
+        """
+            Returns the size in bytes of the given item format, using this buffer's byte-order prefix.
+        """
+
+        return struct.calcsize(self.prefix + fmt)
+
+    def unpack_item (self, fmt, data) :
+        """
+            Unpacks a single item from the given raw data, using this buffer's byte-order prefix.
+        """
+
+        value, = struct.unpack(self.prefix + fmt, data)
+
+        return value
+
+def mmap_buffer (file, size) :
+    """
+        Map `size` bytes of the given open file into memory for reading only, and return the mmap object.
+    """
+
+    # mmap works on the underlying file descriptor
+    fd = file.fileno()
+
+    return mmap.mmap(fd, size, access=mmap.ACCESS_READ)
+
+import exif_data
+
+class Tag (object) :
+    """
+        Represents a single Tag in an IFD.
+
+        Holds the raw fields of the IFD entry, plus whatever exif_data knows about the entry's field type and
+        tag code. The lookup-derived attributes are None when the type or tag code is not known.
+    """
+
+    def __init__ (self, offset, tag, type, count, value_ref) :
+        """
+            Build a Tag with the given binary items from the IFD entry.
+
+                offset      - offset of the IFD entry, as given by whoever read the entry
+                tag         - numeric tag code
+                type        - numeric field type code
+                count       - number of values
+                value_ref   - the entry's value/offset field, as an integer
+        """
+
+        self.offset = offset
+        self.tag = tag
+        self.type = type
+        self.count = count
+        self.value_ref = value_ref
+
+        # defaults for unknown types/tags, so that these attributes always exist
+        self.type_format = None
+        self.type_name = None
+        self.tag_name = None
+        self.tag_value_spec = None
+
+        # lookup the type for this tag
+        self.type_data = exif_data.FIELD_TYPES.get(type)
+
+        # unpack it
+        if self.type_data :
+            self.type_format, self.type_name = self.type_data
+
+        # lookup the tag data for this tag
+        self.tag_data = exif_data.EXIF_TAGS.get(tag)
+
+        # unpack it
+        if self.tag_data :
+            # the EXIF tag name
+            self.tag_name = self.tag_data[0]
+
+            # the optional value formatting specification
+            if len(self.tag_data) > 1 :
+                self.tag_value_spec = self.tag_data[1]
+
+    @property
+    def name (self) :
+        """
+            Lookup the name of this tag via its code, returns None if unknown.
+        """
+
+        if self.tag_data :
+            return self.tag_name
+
+        else :
+            return None
+
+    def readable_value (self, value) :
+        """
+            Convert the given value for this tag into a human-readable form, using exif_data.tag_value with
+            this tag's value specification.
+
+            Returns the value itself, unchanged, if the tag is unknown or has no value specification.
+        """
+
+        if self.tag_data and self.tag_value_spec :
+            # map it
+            return exif_data.tag_value(self.tag_value_spec, value)
+
+        else :
+            # nope...
+            return value
+
+# size of an IFD entry in bytes: two 2-byte fields followed by two 4-byte fields (read with 'HHII')
+IFD_ENTRY_SIZE = 2 + 2 + 4 + 4
+
+class IFD (Buffer) :
+    """
+        A single IFD (Image file directory) region of the EXIF data, accessed as a Buffer.
+    """
+
+    def __init__ (self, buffer, **buffer_opts) :
+        """
+            Set up the underlying Buffer from the given bufferable object and buffer options, and then read
+            the IFD's entry `count` and its `next_offset` field.
+        """
+
+        # init
+        super(IFD, self).__init__(buffer, **buffer_opts)
+
+        # the region begins with the two-byte entry count
+        self.count = self.pread_item(0, 'H')
+
+        # the next-offset field sits right after the count field and all of the entries
+        entries_end = 0x02 + self.count * IFD_ENTRY_SIZE
+        self.next_offset = self.pread_item(entries_end, 'I')
+
+    def iter_tags (self) :
+        """
+            Yield a Tag object for every entry in this IFD, in order.
+
+            Each Tag is given the entry's offset within this IFD region.
+        """
+
+        # entries are packed back-to-back after the count field
+        for entry_offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02) :
+            # tag code, field type, value count and value reference
+            code, field_type, value_count, value_ref = self.pread_struct(entry_offset, 'HHII')
+
+            yield Tag(entry_offset, code, field_type, value_count, value_ref)
+
+class EXIF (Buffer) :
+    """
+        Represents the EXIF data embedded in some image file, in the form of a Buffer whose offset zero is the
+        start of the TIFF header.
+    """
+
+    def __init__ (self, buffer, tags=None, **buffer_opts) :
+        """
+            Access the EXIF data from the given bufferable object with the given buffer options.
+
+            `tags`, if given, specifies that only the given named tags should be loaded.
+            XXX: `tags` is accepted but not used by anything in this class yet.
+        """
+
+        # init Buffer
+        super(EXIF, self).__init__(buffer, **buffer_opts)
+
+        # store
+        self.buffer = buffer
+
+    def iter_ifds (self) :
+        """
+            Iterate over all of the IFD objects in this EXIF, following the chain of next-IFD offsets until a
+            zero offset is found.
+
+            Iteration also stops if the chain leads back to an IFD that was already visited.
+        """
+
+        # starting offset
+        offset = self.pread_item(0x04, 'I')
+
+        # number of bytes available for IFDs
+        total = len(self.buf)
+
+        # offsets already visited, to guard against a chain that loops back on itself
+        seen = set()
+
+        while offset and offset not in seen :
+            seen.add(offset)
+
+            # create and read the IFD as a view of our own region, running up to the end of it, and using the
+            # same byte order as we do. The size is clamped at zero, as buffer() treats -1 as "everything".
+            ifd = IFD(self.buf, offset=offset, size=max(total - offset, 0), struct_prefix=self.prefix)
+
+            # yield it
+            yield ifd
+
+            # skip to next offset
+            offset = ifd.next_offset
+
+    __iter__ = iter_ifds
+
+    def tag_values (self, tag) :
+        """
+            Get the raw values for the given tag as a tuple.
+
+            Returns None if the tag's field type could not be recognized.
+        """
+
+        # unknown tag?
+        if not tag.type_data :
+            return None
+
+        # struct format for all of the values
+        # NOTE(review): this assumes that type_format is a single struct format character - confirm against
+        # exif_data.FIELD_TYPES
+        fmt = "%d%s" % (tag.count, tag.type_format)
+
+        # size of the data
+        data_size = tag.count * self.item_size(tag.type_format)
+
+        # inline or external?
+        if data_size > 0x04 :
+            # the value reference is the offset of the external data, relative to the start of our region
+            return self.pread_struct(tag.value_ref, fmt)
+
+        else :
+            # the values are stored in the four-byte value reference field itself, so re-create the field's
+            # bytes from the decoded integer, and decode the leading data_size bytes using the real format
+            inline = struct.pack(self.prefix + 'I', tag.value_ref)
+
+            return struct.unpack(self.prefix + fmt, inline[:data_size])
+
+    def tag_value (self, tag) :
+        """
+            Return the human-readable string value for the given tag, as a comma-separated list of the
+            formatted values.
+
+            Returns an empty string if the tag is unknown or has no values.
+        """
+
+        # load the raw values
+        values = self.tag_values(tag)
+
+        # unknown?
+        if not values :
+            return ""
+
+        # readable_value may hand back the raw value (e.g. an int), which join() would reject
+        return ", ".join(str(tag.readable_value(value)) for value in values)
+
+# struct byte-order prefix for each of the two-byte TIFF byte order markers:
+# 'II' selects little-endian ('<') and 'MM' big-endian ('>')
+TIFF_BYTE_ORDER = dict(II='<', MM='>')
+
+# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
+TIFF_BYTEORDER_MAGIC = 0x2A
+
+def tiff_load (file, length=0, **opts) :
+    """
+        Load the Exif/TIFF data from the given file at its current position, and return it as an EXIF object.
+
+        `length` is the size of the Exif/TIFF data in bytes; zero means everything up to the end of the file.
+        Any further keyword arguments are passed on to EXIF. The byte order detected from the TIFF header is
+        passed on as `struct_prefix`, unless the caller supplies its own.
+
+        Raises an Exception if the data does not begin with a valid TIFF header.
+    """
+
+    # all Exif data offsets are relative to the beginning of this TIFF header
+    offset = file.tell()
+
+    # read byte-order header
+    byte_order = file.read(2)
+
+    # map to struct prefix
+    struct_prefix = TIFF_BYTE_ORDER.get(byte_order)
+
+    if struct_prefix is None :
+        raise Exception("Invalid byte-order marker for TIFF: %r" % (byte_order, ))
+
+    # validate
+    check_value, = read_struct(file, struct_prefix + 'H')
+
+    if check_value != TIFF_BYTEORDER_MAGIC  :
+        raise Exception("Invalid byte-order for TIFF: %r -> %d" % (byte_order, check_value))
+
+    # mmap the file; the mapping always begins at the start of the file, so it must reach offset + length
+    # bytes into it for the EXIF region to be covered in full
+    if length :
+        buffer = mmap_buffer(file, offset + length)
+        size = length
+
+    else :
+        # a zero length maps the whole file, and the region then runs from the header up to the end of it
+        buffer = mmap_buffer(file, 0)
+        size = len(buffer) - offset
+
+    # the EXIF object must decode values using the byte order that the header declares
+    opts.setdefault('struct_prefix', struct_prefix)
+
+    # build and return the EXIF object with the correct offset/size from the mmap region
+    return EXIF(buffer, offset=offset, size=size, **opts)
+
+# the first marker in a JPEG File
+JPEG_START_MARKER = 0xD8
+
+# the JPEG markers that don't have any data: the start marker, and 0xD9
+JPEG_NOSIZE_MARKERS = (JPEG_START_MARKER, 0xD9)
+
+# the JPEG APP1 marker used for EXIF
+JPEG_EXIF_MARKER = 0xE1
+
+# the signature that begins the data of a JPEG APP1 Exif section
+JPEG_EXIF_HEADER = "Exif\x00\x00"
+
+def jpeg_markers (file) :
+    """
+        Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.
+
+        `size` is the length of the marker's data, not counting the two-byte size field itself, and is 0 for
+        markers with no data. When a tuple is yielded, the file is positioned at the beginning of the data
+        region, and may be seek'd around if needed, as the next marker is located using an absolute seek.
+
+        Iteration ends at the end of the file, and also once the end-of-image (0xD9) or start-of-scan (0xDA)
+        marker has been yielded, as what follows those is not a plain sequence of marker segments.
+
+        Raises an Exception if the file does not contain a marker where one is expected, or if a marker has
+        an invalid size field.
+
+        XXX: find a real implementation of this somewhere?
+    """
+
+    # markers after which there are no further marker segments to walk: end-of-image, start-of-scan
+    last_markers = (0xD9, 0xDA)
+
+    while True :
+        # read type
+        try :
+            marker_byte, marker_type = read_struct(file, '!BB')
+
+        except struct.error :
+            # ran out of file
+            return
+
+        # validate
+        if marker_byte != 0xff :
+            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))
+
+        # special cases for no data; this is decided by the marker's type, as the first byte is always 0xff
+        if marker_type in JPEG_NOSIZE_MARKERS :
+            size = 0
+
+        else :
+            # read size field
+            size, = read_struct(file, '!H')
+
+            # validate
+            if size < 0x02 :
+                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))
+
+            # do not count the size field itself
+            size = size - 2
+
+        # ok, data is at current position
+        offset = file.tell()
+
+        # yield
+        yield marker_type, size
+
+        # nothing more to walk through?
+        if marker_type in last_markers :
+            return
+
+        # absolute seek to next marker
+        file.seek(offset + size)
+
+def jpeg_find_exif (file) :
+    """
+        Find the Exif/TIFF section in the given JPEG file.
+
+        If found, the file will be seek'd to the start of the Exif/TIFF header, and the size of the Exif/TIFF data
+        (the size of the APP1 data, less the Exif signature) will be returned.
+
+        Returns None if jpeg_markers runs out of markers without an EXIF section being found.
+
+        Raises an Exception if the file does not start with the JPEG start marker.
+    """
+
+    for count, (marker, size) in enumerate(jpeg_markers(file)) :
+        # verify that it's a JPEG file
+        if count == 0 :
+            # must start with the right marker
+            if marker != JPEG_START_MARKER :
+                raise Exception("JPEG file must start with 0xFF%02x marker" % (marker, ))
+
+        # look for APP1 marker (0xE1) with EXIF signature
+        # reading the signature moves the file position, which is fine for non-matching sections as well, as
+        # jpeg_markers uses an absolute seek to get to the next marker
+        elif marker == JPEG_EXIF_MARKER and file.read(len(JPEG_EXIF_HEADER)) == JPEG_EXIF_HEADER :
+            # skipped the initial Exif marker signature, so it no longer counts towards the size
+            return size - len(JPEG_EXIF_HEADER)
+
+    # nothing
+    return None
+
+def jpeg_load (file, **opts) :
+    """
+        Loads the embedded Exif TIFF data from the given JPEG file using tiff_load.
+
+        Any keyword arguments are passed on to tiff_load.
+
+        Returns None if no EXIF data could be found, or if the EXIF section is empty.
+    """
+
+    # look for the right section; this leaves the file positioned at the start of the Exif/TIFF header
+    size = jpeg_find_exif(file)
+
+    # not found, or nothing in it?
+    if not size :
+        # nothing
+        return None
+
+    # load it as TIFF data
+    return tiff_load(file, size, **opts)
+
+def load_path (path, **opts) :
+    """
+        Loads an EXIF object from the given filesystem path.
+
+        The loader is chosen by the path's file extension (case-insensitive): .jpeg/.jpg use jpeg_load, and
+        .tiff uses tiff_load. Any keyword arguments are passed on to the loader.
+
+        Returns None if the file extension is not recognized, or if the loader returns None.
+    """
+
+    # file extension
+    root, fext = os.path.splitext(path)
+
+    # map
+    func = {
+        '.jpeg':    jpeg_load,
+        '.jpg':     jpeg_load,
+        '.tiff':    tiff_load,  # XXX: untested
+    }.get(fext.lower())
+
+    # not recognized?
+    if not func :
+        # XXX: sniff the file
+        return None
+
+    # open it
+    file = open(path, 'rb')
+
+    try :
+        # try and load it
+        return func(file, **opts)
+
+    finally :
+        # do not leak the file object, whether loading worked or not
+        # NOTE(review): this relies on mmap keeping its own duplicate of the file descriptor, so that the
+        # mapped EXIF data stays readable after the close - confirm on the target platforms
+        file.close()
+
+def dump_exif (exif) :
+    """
+        Dump all tags from the given EXIF object to stdout.
+
+        Prints one line for the EXIF region, one per IFD, one per tag in each IFD, and one per value of each
+        tag. No value lines are printed for tags whose field type is not recognized.
+    """
+
+    print "EXIF offset=%d, size=%d:" % (exif.offset, exif.size)
+
+    for ifd_index, ifd in enumerate(exif.iter_ifds()) :
+        print "\tIFD %d, offset=%d, size=%d, count=%d, next=%d:" % (ifd_index, ifd.offset, ifd.size, ifd.count, ifd.next_offset)
+
+        # the tags are read from the IFD, not from the EXIF object
+        for tag_index, tag in enumerate(ifd.iter_tags()) :
+            print "\t\tTag %d, offset=%d, tag=%d/%s, type=%d/%s, count=%d:" % (
+                tag_index,
+                tag.offset,
+                tag.tag, tag.name or '???',
+                tag.type, tag.type_name if tag.type_data else '???',
+                tag.count,
+            )
+
+            # tag_values returns None for unknown field types
+            for value_index, value in enumerate(exif.tag_values(tag) or ()) :
+                print "\t\t\t%02d: %s" % (value_index, tag.readable_value(value))
+
+def main (path) :
+    """
+        Load the EXIF data from the file at the given path, and dump it to stdout.
+
+        Raises an Exception if load_path does not come up with anything.
+    """
+
+    # try and load it
+    exif = load_path(path)
+
+    if exif :
+        # the path as a heading, an empty line, and then the dump itself
+        print "%s: " % path
+        print
+
+        dump_exif(exif)
+
+    else :
+        raise Exception("No EXIF data found")
+
+if __name__ == '__main__' :
+    import sys
+
+    # the path of the image to dump is the first command-line argument
+    main(sys.argv[1])
+