1 """ |
|
2 A custom EXIF parsing module, aimed at high performance. |
|
3 """ |
|
4 |
|
5 import struct, mmap, os |
|
6 |
|
7 from utils import lazy_load, lazy_load_iter |
|
8 |
|
def read_struct(file, fmt):
    """
    Read one struct-formatted record from a file object.

    Exactly struct.calcsize(fmt) bytes are requested from `file` at its current position and
    unpacked with `fmt`. Returns the tuple produced by struct.unpack.
    """

    # a short read is not checked here: struct.unpack raises struct.error when the data
    # does not match the size required by the format
    raw = file.read(struct.calcsize(fmt))

    return struct.unpack(fmt, raw)
|
22 |
|
class Buffer(object):
    """
    Wraps a buffer object (anything that supports the python buffer protocol) for read-only access.

    Remembers the offset/size the view was created with, and a struct-module byte-order prefix
    that is prepended to every format used for reading binary data.
    """

    def __init__(self, obj, offset=None, size=None, struct_prefix='='):
        """
        Create a new Buffer object with a new underlying buffer, created from the given object, offset and size.

        The endianness is given in the form of a struct-module prefix, which should be one of '<' or '>'.
        Standard size/alignment are assumed.
        """

        # buffer() takes (object[, offset[, size]]) positionally, see _buffer_args
        self.buf = buffer(obj, *self._buffer_args(offset, size))

        # stored exactly as given (possibly None) for use by callers
        self.offset = offset
        self.size = size
        self.prefix = struct_prefix

    @staticmethod
    def _buffer_args(offset, size):
        """
        Build the positional (offset[, size]) arguments for the buffer() builtin.

        Simply dropping the None values would shift a lone `size` into the offset position,
        so a missing offset is replaced by 0 whenever a size is given.
        """

        if size is not None:
            return (0 if offset is None else offset, size)

        if offset is not None:
            return (offset, )

        return ()

    def subregion(self, offset, length=None):
        """
        Create a new sub-Buffer referencing a view of this buffer, at the given offset, and with the given
        length, if any, and the same struct_prefix.
        """

        return Buffer(self.buf, offset, length, struct_prefix=self.prefix)

    def pread(self, offset, length):
        """
        Read a random-access region of raw data: `length` bytes starting at `offset`.
        """

        return self.buf[offset:offset + length]

    def pread_struct(self, offset, fmt):
        """
        Read structured data using the given struct format from the given offset.

        The byte-order prefix of this Buffer is prepended to `fmt`. Returns the unpacked tuple.
        """

        return struct.unpack_from(self.prefix + fmt, self.buf, offset=offset)

    def pread_item(self, offset, fmt):
        """
        Read a single item of structured data from the given offset.

        `fmt` must describe exactly one item, otherwise the tuple-unpacking raises ValueError.
        """

        value, = self.pread_struct(offset, fmt)

        return value

    def iter_offsets(self, count, size, offset=0):
        """
        Yield a series of offsets for `count` items of `size` bytes, beginning at `offset`.
        """

        return xrange(offset, offset + count * size, size)

    def item_size(self, fmt):
        """
        Returns the size in bytes of the given item format, using this Buffer's byte-order prefix.
        """

        return struct.calcsize(self.prefix + fmt)

    def unpack_item(self, fmt, data):
        """
        Unpacks a single item from the given data, using this Buffer's byte-order prefix.
        """

        value, = struct.unpack(self.prefix + fmt, data)

        return value
|
97 |
|
def mmap_buffer(file, size):
    """
    Memory-map the given open file read-only and return the mmap object.

    `size` is handed to mmap.mmap as the length of the mapping; no offset is passed, so the
    mapping begins at the start of the file rather than at the file's current position.
    """

    fd = file.fileno()

    return mmap.mmap(fd, size, access=mmap.ACCESS_READ)
|
104 |
|
105 import exif_data |
|
106 |
|
class Tag(object):
    """
    A single entry (tag) of an IFD.
    """

    def __init__(self, ifd, offset, tag, type, count, data_raw):
        """
        Store the binary items of the IFD entry and resolve the descriptions for its type and tag code.
        """

        self.ifd, self.offset = ifd, offset
        self.tag, self.type = tag, type
        self.count, self.data_raw = count, data_raw

        # description of the field type, None if the type code is not known
        type_data = exif_data.FIELD_TYPES.get(type)
        self.type_data = type_data

        # NOTE: type_format/type_name/type_func only exist as attributes for known types
        if type_data:
            self.type_format, self.type_name, self.type_func = type_data

        # description of the tag from the owning IFD's tag dictionary, None if not known
        self.tag_data = ifd.tag_dict.get(tag)

    @property
    def name(self):
        """
        The name of this tag as looked up via its code, or None if the tag is unknown.
        """

        info = self.tag_data

        return info.name if info else None

    def is_subifd(self):
        """
        Tests if this Tag is of an IFDTag type.

        The result is only meant to be used for its truth value; for an unknown tag the
        (falsy) tag_data itself is returned.
        """

        info = self.tag_data

        return info and isinstance(info, exif_data.IFDTag)

    @lazy_load
    def subifd(self):
        """
        Load the sub-IFD that this tag points at.
        """

        # prefer the tag-specific dictionary, fall back to the one of the owning IFD
        sub_tag_dict = self.tag_data.ifd_tags or self.ifd.tag_dict

        return self.ifd.exif._load_subifd(self, sub_tag_dict)

    def process_values(self, raw_values):
        """
        Process the given raw values unpacked from the file.

        Applies the filter function of the field type if there is one, otherwise the raw
        values are returned as they are.
        """

        if not (self.type_data and self.type_func):
            return raw_values

        return self.type_func(raw_values)

    def readable_value(self, values):
        """
        Convert the given values for this tag into a human-readable string.

        Known tags map the values themselves; unknown tags get the comma-separated values.
        """

        if not self.tag_data:
            return ", ".join(str(value) for value in values)

        return self.tag_data.map_values(values)
|
192 |
|
# size of an IFD entry in bytes; matches the 'HHI4s' layout (2 + 2 + 4 + 4) used to read entries
IFD_ENTRY_SIZE = 12
|
195 |
|
class IFD(Buffer):
    """
    An IFD (Image file directory) region in EXIF data.
    """

    def __init__(self, exif, buffer, tag_dict, **buffer_opts):
        """
        Access the IFD data from the given bufferable object with given buffer opts.

        The entry `count` and the `next_offset` value are read right away.
        """

        super(IFD, self).__init__(buffer, **buffer_opts)

        self.exif = exif
        self.tag_dict = tag_dict

        # the IFD starts with a two-byte entry count
        self.count = self.pread_item(0, 'H')

        # the next-IFD offset follows directly after the last entry
        entries_end = 0x02 + self.count * IFD_ENTRY_SIZE
        self.next_offset = self.pread_item(entries_end, 'I')

    @lazy_load_iter
    def tags(self):
        """
        Iterate over all the Tag objects in this IFD.
        """

        # entries start after the two-byte count
        for entry_offset in self.iter_offsets(self.count, IFD_ENTRY_SIZE, 0x02):
            # (tag, type, count, data_raw)
            fields = self.pread_struct(entry_offset, 'HHI4s')

            # the Tag gets its offset including this IFD's own offset
            yield Tag(self, self.offset + entry_offset, *fields)

    def get_tags(self, filter=None):
        """
        Yield a series of tag objects for this IFD and all sub-IFDs.

        Sub-IFD tags are replaced by the tags of the IFD they point at. `filter` is only
        handed on to the recursive calls here.
        """

        for tag in self.tags:
            if not tag.is_subifd():
                # normal tag
                yield tag
                continue

            # recurse into the sub-IFD
            for subtag in tag.subifd.get_tags(filter=filter):
                yield subtag
|
249 |
|
class EXIF(Buffer):
    """
    Represents the EXIF data embedded in some image file, accessed through a Buffer.
    """

    def __init__(self, buffer, **buffer_opts):
        """
        Access the EXIF data from the given bufferable object with the given buffer options.
        """

        # init Buffer
        super(EXIF, self).__init__(buffer, **buffer_opts)

        # keep a reference to the object the buffer was created from
        self.buffer = buffer

    @lazy_load_iter
    def ifds(self):
        """
        Iterate over the primary IFDs in this EXIF.

        Follows the chain of next-IFD offsets until a zero offset is found. An offset that was
        already visited also ends the iteration, so a corrupt file cannot loop forever.
        """

        # offset of the first IFD, stored at byte 4
        offset = self.pread_item(0x04, 'I')

        visited = set()

        while offset and offset not in visited:
            visited.add(offset)

            # create and read the IFD on the right sub-buffer, using the same byte order as this EXIF
            ifd = IFD(self, self.buf, exif_data.EXIF_TAGS, offset=offset, struct_prefix=self.prefix)

            yield ifd

            # skip to next offset
            offset = ifd.next_offset

    def _load_subifd(self, tag, tag_dict):
        """
        Creates and returns a sub-IFD for the given tag.

        The tag must carry exactly one value, which is used as the offset of the sub-IFD.
        """

        # locate it
        offset, = self.tag_values_raw(tag)

        # construct the new IFD, using the same byte order as this EXIF
        return IFD(self, self.buf, tag_dict, offset=offset, struct_prefix=self.prefix)

    def tag_data_info(self, tag):
        """
        Calculate the location, format and size of the given tag's data.

        Returns a (fmt, offset, size) tuple, or None if the tag's field type is unknown.
        """

        # unknown type?
        if not tag.type_data:
            return None

        # data format
        if len(tag.type_format) == 1:
            # let struct handle the count
            fmt = "%d%s" % (tag.count, tag.type_format)

        else:
            # multi-character formats: repeat the format ourselves
            fmt = tag.type_format * tag.count

        # size of the data
        size = self.item_size(fmt)

        # inline or external?
        if size > 0x04:
            # the entry's four raw data bytes hold the offset of the external data
            offset = self.unpack_item('I', tag.data_raw)

        else:
            # the data sits in the entry itself, after tag (2), type (2) and count (4)
            offset = tag.offset + 0x08

        return fmt, offset, size

    def tag_values_raw(self, tag):
        """
        Get the raw values for the given tag as a tuple.

        Returns None if the tag could not be recognized.
        """

        data_info = self.tag_data_info(tag)

        if not data_info:
            return None

        data_fmt, data_offset, data_size = data_info

        return self.pread_struct(data_offset, data_fmt)

    def tag_values(self, tag):
        """
        Gets the processed values for the given tag.
        """

        # read + process
        return tag.process_values(self.tag_values_raw(tag))

    def tag_value(self, tag):
        """
        Return the human-readable string value for the given tag, "" if there are no values.
        """

        values = self.tag_values(tag)

        # unknown or empty?
        if not values:
            return ""

        return tag.readable_value(values)

    def get_main_tags(self, **opts):
        """
        Get the tags for the main image's IFD as a dict of tag name -> readable value.

        `opts` are passed on to IFD.get_tags. Raises Exception if there are no IFDs.
        """

        if not self.ifds:
            # weird case
            raise Exception("No IFD for main image found")

        # the main IFD is always the first one
        main_ifd = self.ifds[0]

        return dict((tag.name, self.tag_value(tag)) for tag in main_ifd.get_tags(**opts))
|
386 |
|
# mapping from two-byte TIFF byte order marker to struct prefix
# ('<' is little-endian and '>' is big-endian in the struct module)
TIFF_BYTE_ORDER = {
    'II': '<',
    'MM': '>',
}

# "An arbitrary but carefully chosen number (42) that further identifies the file as a TIFF file"
# read as a two-byte value right after the byte order marker
TIFF_BYTEORDER_MAGIC = 42
|
395 |
|
def tiff_load(file, length=0, **opts):
    """
    Load the Exif/TIFF data from the given file at its current position with optional length.

    The file must be positioned at the TIFF header. `length` is the size of the Exif/TIFF data;
    if it is 0, everything up to the end of the file is used. `opts` are passed on to EXIF; the
    byte order found in the header is used unless the caller gave an explicit struct_prefix.

    Returns the new EXIF object. Raises KeyError for an unknown byte order marker and Exception
    if the magic number following it does not match.
    """

    # all Exif data offsets are relative to the beginning of this TIFF header
    offset = file.tell()

    file_size = os.fstat(file.fileno()).st_size

    if length:
        size = length

        # the mapping starts at the beginning of the file, not at `offset`, so it has to
        # extend to offset + length to contain all of the data (clamped for truncated files)
        map_size = min(offset + length, file_size)

    else:
        # no length given: use the rest of the file
        size = file_size - offset
        map_size = file_size

    # mmap the region for the EXIF data
    buffer = mmap_buffer(file, map_size)

    # read byte-order header
    byte_order = file.read(2)

    # map to struct prefix
    struct_prefix = TIFF_BYTE_ORDER[byte_order]

    # validate
    check_value, = read_struct(file, struct_prefix + 'H')

    if check_value != TIFF_BYTEORDER_MAGIC:
        # %r, because the two-character marker cannot be formatted with %c
        raise Exception("Invalid byte-order for TIFF: %r -> %d" % (byte_order, check_value))

    # read the data with the byte order the file declares
    opts.setdefault('struct_prefix', struct_prefix)

    # build and return the EXIF object with the correct offset/size from the mmap region
    return EXIF(buffer, offset=offset, size=size, **opts)
|
421 |
|
# the JPEG markers that don't have any data (and thus no size field)
JPEG_NOSIZE_MARKERS = (0xD8, 0xD9)

# the first marker in a JPEG File
JPEG_START_MARKER = 0xD8

# the JPEG APP1 marker used for EXIF
JPEG_EXIF_MARKER = 0xE1

# the JPEG APP1 Exif header, found at the start of the APP1 data, before the TIFF header
JPEG_EXIF_HEADER = "Exif\x00\x00"
|
433 |
|
def jpeg_markers(file):
    """
    Iterate over the JPEG markers in the given file, yielding (type_byte, size) tuples.

    The size fields will be 0 for markers with no data. The file will be positioned at the beginning of the data
    region, and may be seek'd around if needed.

    Iteration ends at the end of the file, and after the start-of-scan (0xDA) or end-of-image (0xD9)
    marker has been yielded, as what follows those is not a plain sequence of marker segments.

    Raises Exception for data that is not a marker or has an invalid size field, and struct.error
    for a marker or size field that is cut short.

    XXX: find a real implementation of this somewhere?
    """

    # markers after which no further marker segments are read: EOI, SOS
    last_markers = (0xD9, 0xDA)

    while True:
        # read marker: 0xFF + type
        marker_data = file.read(2)

        if not marker_data:
            # end of file
            return

        marker_byte, marker_type = struct.unpack('!BB', marker_data)

        # validate
        if marker_byte != 0xff:
            raise Exception("Not a JPEG marker: %x%x" % (marker_byte, marker_type))

        # special cases for no data
        if marker_type in JPEG_NOSIZE_MARKERS:
            size = 0

        else:
            # read size field
            size, = read_struct(file, '!H')

            # validate
            if size < 0x02:
                raise Exception("Invalid size for marker %x%x: %x" % (marker_byte, marker_type, size))

            # do not count the size field itself
            size = size - 2

        # ok, data is at current position
        offset = file.tell()

        # yield
        yield marker_type, size

        if marker_type in last_markers:
            return

        # absolute seek to next marker, the consumer may have moved the file position
        file.seek(offset + size)
|
476 |
|
def jpeg_find_exif(file):
    """
    Find the Exif/TIFF section in the given JPEG file.

    If found, the file is left positioned just after the Exif signature, i.e. at the start of the
    Exif/TIFF header, and the size of the Exif/TIFF data is returned.

    Returns None if the markers run out without an EXIF section being found.
    Raises Exception if the first marker is not the JPEG start marker.
    """

    header_size = len(JPEG_EXIF_HEADER)

    for index, (marker, size) in enumerate(jpeg_markers(file)):
        if index == 0:
            # verify that it's a JPEG file: it must start with the right marker
            if marker != JPEG_START_MARKER:
                raise Exception("JPEG file must start with 0xFF%02x marker" % (marker, ))

            continue

        # only APP1 (0xE1) segments are of interest
        if marker != JPEG_EXIF_MARKER:
            continue

        # ...and only those that start with the Exif signature
        if file.read(header_size) == JPEG_EXIF_HEADER:
            # the signature has been consumed, so it does not count towards the size
            return size - header_size

    # nothing
    return None
|
501 |
|
def jpeg_load(file, **opts):
    """
    Loads the embedded Exif TIFF data from the given JPEG file using tiff_load.

    `opts` are passed on to tiff_load. Returns None if no EXIF data could be found
    (this includes an EXIF section of size zero).
    """

    # look for the right section, this also positions the file for tiff_load
    exif_size = jpeg_find_exif(file)

    if exif_size:
        # load it as TIFF data
        return tiff_load(file, exif_size, **opts)

    # not found
    return None
|
520 |
|
def load_path(path, **opts):
    """
    Loads an EXIF object from the given filesystem path.

    The loader is chosen by the (case-insensitive) file extension, and `opts` are passed on to it.

    Returns None if the extension is not recognized, or whatever the loader returns.
    Errors raised while opening or parsing the file are not caught here.
    """

    # file extension
    fext = os.path.splitext(path)[1]

    # map
    func = {
        '.jpeg':    jpeg_load,
        '.jpg':     jpeg_load,
        '.tiff':    tiff_load,  # XXX: untested
    }.get(fext.lower())

    # not recognized?
    if not func:
        # XXX: sniff the file
        return None

    # open it
    file = open(path, 'rb')

    try:
        # try and load it
        return func(file, **opts)

    finally:
        # the mmap object keeps its own duplicate of the file descriptor,
        # so the file itself does not need to stay open
        file.close()
|
548 |
|
def dump_tag(exif, i, tag, indent=2):
    """
    Dump the given tag to stdout: a header line, then either the sub-IFD it points at or its values.

    `i` is the index of the tag shown in the header line, `indent` the number of leading tabs.
    Tags with an unknown field type have no data info or values; those are shown as None/???.
    """

    tabs = '\t' * indent

    data_info = exif.tag_data_info(tag)

    if data_info:
        data_fmt, data_offset, data_size = data_info

        data_offset_str = "%#04x" % data_offset

    else:
        # unknown field type: None cannot be formatted as hex
        data_fmt = data_size = None
        data_offset_str = "None"

    print("%sTag:%d offset=%#04x(%#08x), tag=%d/%s, type=%d/%s, count=%d, fmt=%s, offset=%s, size=%s, is_subifd=%s:" % (
        tabs,
        i,
        tag.offset, tag.offset + exif.offset,
        tag.tag, tag.name or '???',
        tag.type, tag.type_name if tag.type_data else '???',
        tag.count,
        data_fmt, data_offset_str, data_size,
        tag.is_subifd(),
    ))

    if tag.is_subifd():
        # recurse
        dump_ifd(exif, 0, tag.subifd, indent + 1)
        return

    values = exif.tag_values(tag)

    if values is None:
        # unknown field type: no values to show
        print("%s\t-> ???" % (tabs, ))
        return

    # dump each value
    for index, value in enumerate(values):
        print("%s\t%02d: %.120r" % (tabs, index, value))

    # and then the readable one
    print("%s\t-> %.120s" % (tabs, tag.readable_value(values), ))
|
586 |
|
587 |
|
def dump_ifd(exif, i, ifd, indent=1):
    """
    Dump the given IFD to stdout: a header line, followed by each of its tags via dump_tag.

    `i` is the index of the IFD shown in the header line, `indent` the number of leading tabs.
    """

    header = "%sIFD:%d offset=%#04x(%#08x), count=%d, next=%d:" % (
        '\t' * indent,
        i,
        ifd.offset, ifd.offset + exif.offset,
        ifd.count,
        ifd.next_offset,
    )

    print(header)

    # tags go one level deeper
    for index, tag in enumerate(ifd.tags):
        dump_tag(exif, index, tag, indent + 1)
|
604 |
|
605 |
|
def dump_exif(exif):
    """
    Dump the given EXIF object to stdout: a header line, followed by each of its IFDs via dump_ifd.
    """

    header = "EXIF offset=%#08x, size=%d:" % (exif.offset, exif.size)

    print(header)

    for index, ifd in enumerate(exif.ifds):
        dump_ifd(exif, index, ifd)
|
616 |
|
617 |
|
def list_tags(exif):
    """
    Print a neat "name: value" listing of the main image's tags to stdout.
    """

    main_tags = exif.get_main_tags()

    for name, value in main_tags.iteritems():
        # names are right-aligned in a 30 character column
        print("%30s: %s" % (name, value))
|
625 |
|
def main_path(path, dump):
    """
    Print the path, load the EXIF data from it and either dump or list its tags.

    Raises Exception if no EXIF data could be loaded.
    """

    # dump path
    print("%s: " % path)

    # try and load it
    exif = load_path(path)

    if not exif:
        raise Exception("No EXIF data found")

    # dump everything, or just list the main tags
    show = dump_exif if dump else list_tags

    show(exif)
|
643 |
|
644 |
|
def main(paths, dump=False):
    """
    Load the EXIF data from each of the given paths in turn, and dump or list it via main_path.
    """

    for image_path in paths:
        main_path(image_path, dump=dump)
|
653 |
|
if __name__ == '__main__':
    import getopt
    from sys import argv

    # command line: [-d|--dump] path [...]
    opts, args = getopt.getopt(argv[1:], "d", ["dump"])

    # only list the main tags unless a full dump was asked for
    dump = False

    for opt, val in opts:
        if opt == '-d' or opt == "--dump":
            dump = True

    main(args, dump=dump)
|
669 |
|