degal/lib/exif.py
branch new-exif
changeset 108 f74d8cf678ce
parent 107 2e2ef5c99985
equal deleted inserted replaced
107:2e2ef5c99985 108:f74d8cf678ce
       
     1 """
       
     2     A custom EXIF parsing module, aimed at high performance.
       
     3 """
       
     4 
       
     5 import struct, mmap, os
       
     6 
       
     7 from utils import lazy_load, lazy_load_iter
       
     8 
       
     9 def read_struct (file, fmt) :
       
    10     """
       
    11         Utility function to read data from the a file using struct
       
    12     """
       
    13     
       
    14     # length of data
       
    15     fmt_size = struct.calcsize(fmt)
       
    16     
       
    17     # get data
       
    18     file_data = file.read(fmt_size)
       
    19     
       
    20     # unpack single item, this should raise an error if file_data is too short
       
    21     return struct.unpack(fmt, file_data)
       
    22 
       
    23 class Buffer (object) :
       
    24     """
       
    25         Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.
       
    26         
       
    27         Includes an offset for relative values, and an endianess for reading binary data.
       
    28     """
       
    29     
       
    30     def __init__ (self, obj, offset=None, size=None, struct_prefix='=') :
       
    31         """
       
    32             Create a new Buffer object with a new underlying buffer, created from the given object, offset and size.
       
    33             
       
    34             The endiannes is given in the form of a struct-module prefix, which should be one of '<' or '>'.
       
    35             Standard size/alignment are assumed.
       
    36         """
       
    37 
       
    38         # store
       
    39         self.buf = buffer(obj, *(arg for arg in (offset, size) if arg is not None))
       
    40         self.offset = offset
       
    41         self.size = size
       
    42         self.prefix = struct_prefix
       
    43     
       
    44     def subregion (self, offset, length=None) :
       
    45         """
       
    46             Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
       
    47             length, if any, and the same struct_prefix.
       
    48         """
       
    49 
       
    50         return Buffer(self.buf, offset, length, struct_prefix=self.prefix)
       
    51     
       
    52     def pread (self, offset, length) :
       
    53         """
       
    54             Read a random-access region of raw data
       
    55         """
       
    56 
       
    57         return self.buf[offset:offset + length]
       
    58     
       
    59     def pread_struct (self, offset, fmt) :
       
    60         """
       
    61             Read structured data using the given struct format from the given offset.
       
    62         """
       
    63 
       
    64         return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)
       
    65 
       
    66     def pread_item (self, offset, fmt) :
       
    67         """
       
    68             Read a single item of structured data from the given offset.
       
    69         """
       
    70 
       
    71         value, = self.pread_struct(offset, fmt)
       
    72 
       
    73         return value
       
    74 
       
    75     def iter_offsets (self, count, size, offset=0) :
       
    76         """
       
    77             Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
       
    78         """
       
    79 
       
    80         return xrange(offset, offset + count * size, size)
       
    81     
       
    82     def item_size (self, fmt) :
       
    83         """
       
    84             Returns the size in bytes of the given item format
       
    85         """
       
    86 
       
    87         return struct.calcsize(self.prefix + fmt)
       
    88 
       
    89     def unpack_item (self, fmt, data) :
       
    90         """
       
    91             Unpacks a single item from the given data
       
    92         """
       
    93 
       
    94         value, = struct.unpack(self.prefix + fmt, data)
       
    95         
       
    96         return value
       
    97 
       
    98 def mmap_buffer (file, size) :
       
    99     """
       
   100         Create and return a new read-only mmap'd region
       
   101     """
       
   102 
       
   103     return mmap.mmap(file.fileno(), size, access=mmap.ACCESS_READ)
       
   104 
       
   105 import exif_data
       
   106 
       
   107 class Tag (object) :
       
   108     """
       
   109         Represents a single Tag in an IFD
       
   110     """
       
   111 
       
   112     def __init__ (self, ifd, offset, tag, type, count, data_raw) :
       
   113         """
       
   114             Build a Tag with the given binary items from the IFD entry
       
   115         """
       
   116         
       
   117         self.ifd = ifd
       
   118         self.offset = offset
       
   119         self.tag = tag
       
   120         self.type = type
       
   121         self.count = count
       
   122         self.data_raw = data_raw
       
   123         
       
   124         # lookup the type for this tag
       
   125         self.type_data = exif_data.FIELD_TYPES.get(type)
       
   126 
       
   127         # unpack it
       
   128         if self.type_data :
       
   129             self.type_format, self.type_name, self.type_func = self.type_data
       
   130     
       
   131         # lookup the tag data for this tag
       
   132         self.tag_data = self.ifd.tag_dict.get(tag)
       
   133         
       
   134     @property
       
   135     def name (self) :
       
   136         """
       
   137             Lookup the name of this tag via its code, returns None if unknown.
       
   138         """
       
   139 
       
   140         if self.tag_data :
       
   141             return self.tag_data.name
       
   142 
       
   143         else :
       
   144             return None
       
   145     
       
   146     def is_subifd (self) :
       
   147         """
       
   148             Tests if this Tag is of a IFDTag type
       
   149         """
       
   150 
       
   151         return self.tag_data and isinstance(self.tag_data, exif_data.IFDTag)
       
   152     
       
   153     @lazy_load
       
   154     def subifd (self) :
       
   155         """
       
   156             Load the sub-IFD for this tag
       
   157         """
       
   158 
       
   159         # the tag_dict to use
       
   160         tag_dict = self.tag_data.ifd_tags or self.ifd.tag_dict
       
   161         
       
   162         # construct, return
       
   163         return self.ifd.exif._load_subifd(self, tag_dict)
       
   164 
       
   165     def process_values (self, raw_values) :
       
   166         """
       
   167             Process the given raw values unpacked from the file.
       
   168         """
       
   169 
       
   170         if self.type_data and self.type_func :
       
   171             # use the filter func
       
   172             return self.type_func(raw_values)
       
   173 
       
   174         else :
       
   175             # nada, just leave them
       
   176             return raw_values
       
   177 
       
   178     def readable_value (self, values) :
       
   179         """
       
   180             Convert the given values for this tag into a human-readable string.
       
   181 
       
   182             Returns the comma-separated values by default.
       
   183         """
       
   184 
       
   185         if self.tag_data :
       
   186             # map it
       
   187             return self.tag_data.map_values(values)
       
   188 
       
   189         else :
       
   190             # default value-mapping
       
   191             return ", ".join(str(value) for value in values)
       
   192 
       
   193 # size of an IFD entry in bytes
       
   194 IFD_ENTRY_SIZE = 12
       
   195 
       
   196 class IFD (Buffer) :
       
   197     """
       
   198         Represents an IFD (Image file directory) region in EXIF data.
       
   199     """
       
   200 
       
   201     def __init__ (self, exif, buffer, tag_dict, **buffer_opts) :
       
   202         """
       
   203             Access the IFD data from the given bufferable object with given buffer opts.
       
   204 
       
   205             This will read the `count` and `next_offset` values.
       
   206         """
       
   207 
       
   208         # init
       
   209         super(IFD, self).__init__(buffer, **buffer_opts)
       
   210 
       
   211         # store
       
   212         self.exif = exif
       
   213         self.tag_dict = tag_dict
       
   214         
       
   215         # read header
       
   216         self.count = self.pread_item(0, 'H')
       
   217 
       
   218         # read next-offset
       
   219         self.next_offset = self.pread_item(0x02 + self.count * IFD_ENTRY_SIZE, 'I')
       
   220     
       
   221     @lazy_load_iter
       
   222     def tags (self) :
       
   223         """
       
   224             Iterate over all the Tag objects in this IFD
       
   225         """
       
   226         
       
   227         # read each tag
       
   228         for offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02) :
       
   229             # read the tag data
       
   230             tag, type, count, data_raw = self.pread_struct(offset, 'HHI4s')
       
   231             
       
   232             # yield the new Tag
       
   233             yield Tag(self, self.offset + offset, tag, type, count, data_raw)
       
   234 
       
   235     def get_tags (self, filter=None) :
       
   236         """
       
   237             Yield a series of tag objects for this IFD and all sub-IFDs.
       
   238         """
       
   239         
       
   240         for tag in self.tags :
       
   241             if tag.is_subifd() :
       
   242                 # recurse
       
   243                 for subtag in tag.subifd.get_tags(filter=filter) :
       
   244                     yield subtag
       
   245             
       
   246             else :
       
   247                 # normal tag
       
   248                 yield tag
       
   249 
       
   250 class EXIF (Buffer) :
       
   251     """
       
   252         Represents the EXIF data embedded in some image file in the form of a Region.
       
   253     """
       
   254 
       
   255     def __init__ (self, buffer, **buffer_opts) :
       
   256         """
       
   257             Access the EXIF data from the given bufferable object with the given buffer options.
       
   258         """
       
   259 
       
   260         # init Buffer
       
   261         super(EXIF, self).__init__(buffer, **buffer_opts)
       
   262 
       
   263         # store
       
   264         self.buffer = buffer
       
   265     
       
   266     @lazy_load_iter
       
   267     def ifds (self) :
       
   268         """
       
   269             Iterate over the primary IFDs in this EXIF.
       
   270         """
       
   271 
       
   272         # starting offset
       
   273         offset = self.pread_item(0x04, 'I')
       
   274 
       
   275         while offset :
       
   276             # create and read the IFD, operating on the right sub-buffer
       
   277             ifd = IFD(self, self.buf, exif_data.EXIF_TAGS, offset=offset)
       
   278 
       
   279             # yield it
       
   280             yield ifd
       
   281 
       
   282             # skip to next offset
       
   283             offset = ifd.next_offset
       
   284     
       
   285     def _load_subifd (self, tag, tag_dict) :
       
   286         """
       
   287             Creates and returns a sub-IFD for the given tag.
       
   288         """
       
   289 
       
   290         # locate it
       
   291         offset, = self.tag_values_raw(tag)
       
   292 
       
   293         # construct the new IFD
       
   294         return IFD(self, self.buf, tag_dict, offset=offset)
       
   295 
       
   296     def tag_data_info (self, tag) :
       
   297         """
       
   298             Calculate the location, format and size of the given tag's data.
       
   299 
       
   300             Returns a (fmt, offset, size) tuple.
       
   301         """
       
   302         # unknown tag?
       
   303         if not tag.type_data :
       
   304             return None
       
   305 
       
   306         # data format
       
   307         if len(tag.type_format) == 1 :
       
   308             # let struct handle the count
       
   309             fmt = "%d%s" % (tag.count, tag.type_format)
       
   310 
       
   311         else :
       
   312             # handle the count ourselves
       
   313             fmt = tag.type_format * tag.count
       
   314 
       
   315         # size of the data
       
   316         size = self.item_size(fmt)
       
   317 
       
   318         # inline or external?
       
   319         if size > 0x04 :
       
   320             # point at the external data
       
   321             offset = self.unpack_item('I', tag.data_raw)
       
   322 
       
   323         else :
       
   324             # point at the inline data
       
   325             offset = tag.offset + 0x08
       
   326         
       
   327         return fmt, offset, size
       
   328 
       
   329     def tag_values_raw (self, tag) :
       
   330         """
       
   331             Get the raw values for the given tag as a tuple.
       
   332 
       
   333             Returns None if the tag could not be recognized.
       
   334         """
       
   335 
       
   336         # find the data
       
   337         data_info = self.tag_data_info(tag)
       
   338 
       
   339         # not found?
       
   340         if not data_info :
       
   341             return None
       
   342         
       
   343         # unpack
       
   344         data_fmt, data_offset, data_size = data_info
       
   345         
       
   346         # read values
       
   347         return self.pread_struct(data_offset, data_fmt)
       
   348     
       
   349     def tag_values (self, tag) :
       
   350         """
       
   351             Gets the processed values for the given tag as a list.
       
   352         """
       
   353 
       
   354         # read + process
       
   355         return tag.process_values(self.tag_values_raw(tag))
       
   356 
       
   357     def tag_value (self, tag) :
       
   358         """
       
   359             Return the human-readable string value for the given tag.
       
   360         """
       
   361         
       
   362         # load the raw values
       
   363         values = self.tag_values(tag)
       
   364 
       
   365         # unknown?
       
   366         if not values :
       
   367             return ""
       
   368 
       
   369         # return as comma-separated formatted string, yes
       
   370         return tag.readable_value(values)
       
   371     
       
   372     def get_main_tags (self, **opts) :
       
   373         """
       
   374             Get the tags for the main image's IFD as a dict.
       
   375         """
       
   376 
       
   377         if not self.ifds :
       
   378             # weird case
       
   379             raise Exception("No IFD for main image found")
       
   380 
       
   381         # the main IFD is always the first one
       
   382         main_ifd = self.ifds[0]
       
   383 
       
   384         # do it
       
   385         return dict((tag.name, self.tag_value(tag)) for tag in main_ifd.get_tags(**opts))
       
   386 
       
   387 # mapping from two-byte TIFF byte order marker to struct prefix
       
   388 TIFF_BYTE_ORDER = {
       
   389     'II': '<',
       
   390     'MM': '>',
       
   391 }
       
   392 
       
   393 # "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
       
   394 TIFF_BYTEORDER_MAGIC = 42 
       
   395 
       
   396 def tiff_load (file, length=0, **opts) :
       
   397     """
       
   398         Load the Exif/TIFF data from the given file at its current position with optional length, using exif_load.
       
   399     """
       
   400 
       
   401     # all Exif data offsets are relative to the beginning of this TIFF header
       
   402     offset = file.tell()
       
   403 
       
   404     # mmap the region for the EXIF data
       
   405     buffer = mmap_buffer(file, length)
       
   406 
       
   407     # read byte-order header
       
   408     byte_order = file.read(2)
       
   409 
       
   410     # map to struct prefix
       
   411     struct_prefix = TIFF_BYTE_ORDER[byte_order]
       
   412 
       
   413     # validate
       
   414     check_value, = read_struct(file, struct_prefix + 'H')
       
   415 
       
   416     if check_value != TIFF_BYTEORDER_MAGIC  :
       
   417         raise Exception("Invalid byte-order for TIFF: %2c -> %d" % (byte_order, check_value))
       
   418 
       
   419     # build and return the EXIF object with the correct offset/size from the mmap region
       
   420     return EXIF(buffer, offset=offset, size=length, **opts)
       
   421 
       
   422 # the JPEG markers that don't have any data
       
   423 JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)
       
   424 
       
   425 # the first marker in a JPEG File
       
   426 JPEG_START_MARKER = 0xD8
       
   427 
       
   428 # the JPEG APP1 marker used for EXIF
       
   429 JPEG_EXIF_MARKER = 0xE1
       
   430 
       
   431 # the JPEG APP1 Exif header
       
   432 JPEG_EXIF_HEADER = "Exif\x00\x00"
       
   433 
       
   434 def jpeg_markers (file) :
       
   435     """
       
   436         Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.
       
   437 
       
   438         The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
       
   439         region, and may be seek'd around if needed.
       
   440         
       
   441         XXX: find a real implementation of this somewhere?
       
   442     """
       
   443 
       
   444     while True :
       
   445         # read type
       
   446         marker_byte, marker_type = read_struct(file, '!BB')
       
   447         
       
   448         # validate
       
   449         if marker_byte != 0xff :
       
   450             raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))
       
   451 
       
   452         # special cases for no data
       
   453         if marker_type in JPEG_NOSIZE_MARKERS :
       
   454             size = 0
       
   455 
       
   456         else :
       
   457             # read size field
       
   458             size, = read_struct(file, '!H')
       
   459             
       
   460             # validate
       
   461             if size < 0x02 :
       
   462                 raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))
       
   463             
       
   464             else :
       
   465                 # do not count the size field itself
       
   466                 size = size - 2
       
   467             
       
   468         # ok, data is at current position
       
   469         offset = file.tell()
       
   470         
       
   471         # yield
       
   472         yield marker_type, size
       
   473 
       
   474         # absolute seek to next marker
       
   475         file.seek(offset + size)
       
   476 
       
   477 def jpeg_find_exif (file) :
       
   478     """
       
   479         Find the Exif/TIFF section in the given JPEG file.
       
   480 
       
   481         If found, the file will be seek'd to the start of the Exif/TIFF header, and the size of the Exif/TIFF data will
       
   482         be returned.
       
   483         
       
   484         Returns None if no EXIF section was found.
       
   485     """
       
   486 
       
   487     for count, (marker, size) in enumerate(jpeg_markers(file)) :
       
   488         # verify that it's a JPEG file
       
   489         if count == 0 :
       
   490             # must start with the right marker
       
   491             if marker != JPEG_START_MARKER :
       
   492                 raise Exception("JPEG file must start with 0xFF%02x marker" % (marker, ))
       
   493 
       
   494         # look for APP1 marker (0xE1) with EXIF signature
       
   495         elif marker == JPEG_EXIF_MARKER and file.read(len(JPEG_EXIF_HEADER)) == JPEG_EXIF_HEADER:
       
   496             # skipped the initial Exif marker signature
       
   497             return size - len(JPEG_EXIF_HEADER)
       
   498 
       
   499     # nothing
       
   500     return None
       
   501 
       
   502 def jpeg_load (file, **opts) :
       
   503     """
       
   504         Loads the embedded Exif TIFF data from the given JPEG file using tiff_load.
       
   505 
       
   506         Returns None if no EXIF data could be found.
       
   507     """
       
   508         
       
   509     # look for the right section
       
   510     size = jpeg_find_exif(file)
       
   511     
       
   512     # not found?
       
   513     if not size :
       
   514         # nothing
       
   515         return
       
   516 
       
   517     else :    
       
   518         # load it as TIFF data
       
   519         return tiff_load(file, size, **opts)
       
   520 
       
   521 def load_path (path, **opts) :
       
   522     """
       
   523         Loads an EXIF object from the given filesystem path.
       
   524 
       
   525         Returns None if it could not be parsed.
       
   526     """
       
   527     
       
   528     # file extension
       
   529     root, fext = os.path.splitext(path)
       
   530 
       
   531     # map
       
   532     func = {
       
   533         '.jpeg':    jpeg_load,
       
   534         '.jpg':     jpeg_load,
       
   535         '.tiff':    tiff_load,  # XXX: untested
       
   536     }.get(fext.lower())
       
   537     
       
   538     # not recognized?
       
   539     if not func :
       
   540         # XXX: sniff the file
       
   541         return None
       
   542 
       
   543     # open it
       
   544     file = open(path, 'rb')
       
   545 
       
   546     # try and load it
       
   547     return func(file, **opts)
       
   548 
       
   549 def dump_tag (exif, i, tag, indent=2) :
       
   550     """
       
   551         Dump the given tag
       
   552     """
       
   553 
       
   554     data_info = exif.tag_data_info(tag)
       
   555 
       
   556     if data_info :
       
   557         data_fmt, data_offset, data_size = data_info
       
   558 
       
   559     else :
       
   560         data_fmt = data_offset = data_size = None
       
   561 
       
   562     print "%sTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%#04x, size=%s, is_subifd=%s:" % (
       
   563         '\t'*indent,
       
   564         i, 
       
   565         tag.offset, tag.offset + exif.offset,
       
   566         tag.tag, tag.name or '???',
       
   567         tag.type, tag.type_name if tag.type_data else '???',
       
   568         tag.count,
       
   569         data_fmt, data_offset, data_size,
       
   570         tag.is_subifd(),
       
   571     )
       
   572     
       
   573     if tag.is_subifd() :
       
   574         # recurse
       
   575         dump_ifd(exif, 0, tag.subifd, indent + 1)
       
   576 
       
   577     else :
       
   578         # dump each value
       
   579         values = exif.tag_values(tag)
       
   580         
       
   581         for i, value in enumerate(values) :
       
   582             print "%s\t%02d: %.120r" % ('\t'*indent, i, value)
       
   583         
       
   584         # and then the readable one
       
   585         print "%s\t->  %.120s" % ('\t'*indent, tag.readable_value(values), )
       
   586 
       
   587 
       
   588 def dump_ifd (exif, i, ifd, indent=1) :
       
   589     """
       
   590         Dump the given IFD, recursively
       
   591     """
       
   592 
       
   593     print "%sIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
       
   594         '\t'*indent,
       
   595         i, 
       
   596         ifd.offset, ifd.offset + exif.offset,
       
   597         ifd.count, 
       
   598         ifd.next_offset
       
   599     )
       
   600     
       
   601     for i, tag in enumerate(ifd.tags) :
       
   602         # dump
       
   603         dump_tag(exif, i, tag, indent + 1)
       
   604 
       
   605 
       
   606 def dump_exif (exif) :
       
   607     """
       
   608         Dump all tags from the given EXIF object to stdout
       
   609     """
       
   610 
       
   611     print "EXIF offset=%#08x, size=%d:" % (exif.offset, exif.size)
       
   612 
       
   613     for i, ifd in enumerate(exif.ifds) :
       
   614         # dump
       
   615         dump_ifd(exif, i, ifd)
       
   616 
       
   617 
       
   618 def list_tags (exif) :
       
   619     """
       
   620         Print a neat listing of tags to stdout
       
   621     """
       
   622 
       
   623     for k, v in exif.get_main_tags().iteritems() :
       
   624         print "%30s: %s" % (k, v)
       
   625 
       
   626 def main_path (path, dump) :
       
   627     # dump path
       
   628     print "%s: " % path
       
   629 
       
   630     # try and load it
       
   631     exif = load_path(path)
       
   632  
       
   633     if not exif :
       
   634         raise Exception("No EXIF data found")
       
   635 
       
   636     if dump :
       
   637         # dump everything
       
   638         dump_exif(exif)
       
   639     
       
   640     else :
       
   641         # list them
       
   642         list_tags(exif)   
       
   643 
       
   644 
       
   645 def main (paths, dump=False) :
       
   646     """
       
   647         Load and dump EXIF data from the given path
       
   648     """
       
   649     
       
   650     # handle each one
       
   651     for path in paths :
       
   652         main_path(path, dump=dump)
       
   653 
       
   654 if __name__ == '__main__' :
       
   655     import getopt
       
   656     from sys import argv
       
   657     
       
   658     # defaults
       
   659     dump = False
       
   660 
       
   661     # parse args
       
   662     opts, args = getopt.getopt(argv[1:], "d", ["dump"])
       
   663 
       
   664     for opt, val in opts :
       
   665         if opt in ('-d', "--dump") :
       
   666             dump = True
       
   667 
       
   668     main(args, dump=dump)
       
   669