rsync-snapshot: manage --link-dest'd interval snapshots
author Tero Marttila <terom@paivola.fi>
Tue, 14 Feb 2012 21:51:30 +0200
changeset 12 fbfdde7326f4
parent 11 f8dd32bf0e89
child 13 bbda233b91c8
rsync-snapshot: manage --link-dest'd interval snapshots
.hgignore
README
pvl/backup/invoke.py
pvl/backup/rsync.py
scripts/pvlbackup-rsync-snapshot
scripts/pvlbackup-rsync-wrapper
--- a/.hgignore	Tue Feb 14 20:29:11 2012 +0200
+++ b/.hgignore	Tue Feb 14 21:51:30 2012 +0200
@@ -3,6 +3,7 @@
 \.sw[op]$
 \.pyc$
 
+^test/
 ^misc/
 ^dist/
 ^MANIFEST$
--- a/README	Tue Feb 14 20:29:11 2012 +0200
+++ b/README	Tue Feb 14 21:51:30 2012 +0200
@@ -21,3 +21,7 @@
             TODO:
                 * fsck?
 
+    rsync-snapshot:
+        Manage hardlinked snapshots stored for configurable intervals.
+
+        Uses rsync --link-dest.
--- a/pvl/backup/invoke.py	Tue Feb 14 20:29:11 2012 +0200
+++ b/pvl/backup/invoke.py	Tue Feb 14 21:51:30 2012 +0200
@@ -45,17 +45,56 @@
 
     return stdout
 
+def process_opt (opt, value) :
+    """
+        Mangle from python keyword-argument dict format to command-line option tuple format.
+
+        >>> process_opt('foo', True)
+        ('--foo',)
+        >>> process_opt('foo', False)
+        ()
+        >>> process_opt('foo', 2)
+        ('--foo', '2')
+        >>> process_opt('foo', 'bar')
+        ('--foo', 'bar')
+        >>> process_opt('foo_bar', 'asdf')
+        ('--foo-bar', 'asdf')
+
+        # XXX: weird?
+        >>> process_opt('bar', '')
+        ('--bar', '')
+
+        Returns a tuple of argv items.
+    """
+
+    # mangle opt
+    opt = '--' + opt.replace('_', '-')
+
+    if value is True :
+        # flag opt
+        return (opt, )
+
+    elif value is False or value is None:
+        # flag opt / omit
+        return ( )
+
+    else :
+        # as-is
+        return (opt, str(value))
+
 def optargs (*args, **kwargs) :
     """
         Convert args/options into command-line format
     """
 
+    ## opts
     # process
-    opts = [('--{opt}'.format(opt=opt), value if value != True else None) for opt, value in kwargs.iteritems() if value]
+    opts = [process_opt(opt, value) for opt, value in kwargs.iteritems()]
 
     # flatten
     opts = [str(opt_part) for opt_parts in opts for opt_part in opt_parts if opt_part]
 
+    ## args
     args = [str(arg) for arg in args if arg]
 
     return opts + args
@@ -71,4 +110,8 @@
 
     # invoke
     return invoke(cmd, optargs(*args, **opts))
- 
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
+
--- a/pvl/backup/rsync.py	Tue Feb 14 20:29:11 2012 +0200
+++ b/pvl/backup/rsync.py	Tue Feb 14 21:51:30 2012 +0200
@@ -4,9 +4,9 @@
     Apologies for the 'RSync' nomenclature
 """
 
-from pvl.backup.invoke import invoke
 from pvl.backup.lvm import LVM, LVMVolume, LVMSnapshot
 from pvl.backup.mount import mount
+from pvl.backup import invoke
 
 import shlex
 import os.path
@@ -15,6 +15,15 @@
 
 log = logging.getLogger('pvl.backup.rsync')
 
+# Path to rsync binary
+RSYNC = '/usr/bin/rsync'
+
+def rsync (source, dest, **opts) :
+    """
+        Run rsync.
+    """
+
+    invoke.command(RSYNC, source, dest, **opts)
 
 class RSyncCommandFormatError (Exception) :
     """
@@ -23,36 +32,39 @@
 
     pass
 
-class RSyncSource (object) :
-    RSYNC = '/usr/bin/rsync'
+class RSyncServer (object) :
+    """
+        rsync server-mode execution.
+    """
 
     def _execute (self, options, path) :
         """
             Underlying rsync just reads from filesystem.
         """
+        
+        # invoke directly, no option-handling, nor stdin/out redirection
+        invoke.invoke(RSYNC, options + ['.', path], data=False)
 
-        invoke(self.RSYNC, options + ['.', path], data=False)
-
-class RSyncFSSource (RSyncSource) :
+class RSyncFSServer (RSyncServer) :
     """
         Normal filesystem backup.
     """
 
     def __init__ (self, path) :
-        RSyncSource.__init__(self)
+        RSyncServer.__init__(self)
 
         self.path = path
 
     def execute (self, options) :
         return self._execute(options, self.path)
 
-class RSyncLVMSource (RSyncSource) :
+class RSyncLVMServer (RSyncServer) :
     """
         Backup LVM LV by snapshotting + mounting it.
     """
 
     def __init__ (self, volume) :
-        RSyncSource.__init__(self)
+        RSyncServer.__init__(self)
 
         self.volume = volume
  
@@ -177,7 +189,7 @@
         # XXX: how to handle=
         log.info("filesystem: %s", path)
 
-        return RSyncFSSource(path)
+        return RSyncFSServer(path)
 
     elif path.startswith('lvm:') :
         # LVM LV
@@ -195,10 +207,9 @@
         lvm = LVM(vg)
         volume = lvm.volume(lv)
 
-        return RSyncLVMSource(volume)
+        return RSyncLVMServer(volume)
        
     else :
         # invalid
         raise RSyncCommandFormatError("Unrecognized backup path")
 
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/pvlbackup-rsync-snapshot	Tue Feb 14 21:51:30 2012 +0200
@@ -0,0 +1,429 @@
+#!/usr/bin/python
+
+"""
+    Manage rsync --link-dest based snapshots.
+
+    rsync's from <src> to <dst>/snapshots/YYYYMMDD-HHMMSS using --link-dest <dst>/current.
+
+    Updates symlink <dst>/current -> <dst>/snapshots/...
+
+    Then symlinks <dst>/<interval>/<date> -> ../snapshots/... for each enabled --interval.
+"""
+
+from pvl.backup import rsync
+
+import optparse
+import os, os.path, stat
+import shutil
+import datetime
+import logging
+
+log = logging.getLogger()
+
+# command-line options
+options = None
+
+def parse_options (argv) :
+    """
+        Parse command-line arguments.
+    """
+
+    parser = optparse.OptionParser(
+            prog        = argv[0],
+            usage       = '%prog: [options] --source <src> --destination <dst>',
+
+            # module docstring
+            # XXX: breaks multi-line descriptions..
+            description = __doc__,
+    )
+
+    # logging
+    general = optparse.OptionGroup(parser, "General Options")
+
+    general.add_option('-q', '--quiet',      dest='loglevel', action='store_const', const=logging.WARNING, help="Less output")
+    general.add_option('-v', '--verbose',    dest='loglevel', action='store_const', const=logging.INFO,  help="More output")
+    general.add_option('-D', '--debug',      dest='loglevel', action='store_const', const=logging.DEBUG, help="Even more output")
+
+    parser.add_option_group(general)
+
+    #
+    parser.add_option('-s', '--source',     metavar='RSYNC-PATH',
+        help="Backup source in rsync-syntax")
+
+    parser.add_option('-d', '--destination',    metavar='RSYNC-PATH',
+        help="Backup destination in rsync-syntax")
+
+    parser.add_option('--interval',         metavar='NAME', action='append', dest='intervals',
+        help="Enable given interval")
+
+    parser.add_option('--clean-intervals',  action='store_true',
+        help="Clean out old interval links")
+
+    parser.add_option('--clean-snapshots',  action='store_true',
+        help="Clean out unused snapshots (those not linked to)")
+
+    parser.add_option('--clean',             action='store_true',
+        help="Clean out both intervals and snapshots")
+
+    parser.add_option('-n', '--dry-run',    action='store_true',
+        help="Don't actually clean anything")
+
+    # defaults
+    parser.set_defaults(
+        loglevel    = logging.WARNING,
+        
+        snapshot_format = '%Y%m%d-%H%M%S',
+
+        ## XXX: configure somehow
+        # rsync options, in invoke.optargs format
+        rsync_options = {
+            'archive':          True,
+            'hard-links':       True,
+            'one-file-system':  True,
+            'numeric-ids':      True,
+            'delete':           True,
+        },
+
+        # datetime formats for intervals
+        interval_format = {
+            'all':      None,       # default to snapshot_format
+            'day':      '%Y-%m-%d',
+            'week':     '%Y-%W',
+            'month':    '%Y-%m',
+            'year':     '%Y',
+        },
+
+        # retention for intervals
+        interval_retention = {
+            'all':      4,
+            'day':      7,
+            'week':     4,
+            'month':    4,
+            'year':     1,
+        },
+
+        # selected intervals
+        intervals       = [],
+    )
+
+    # parse
+    options, args = parser.parse_args(argv[1:])
+
+    # validate
+    if not options.destination :
+        parser.error("--destination is required")
+
+    # configure
+    logging.basicConfig(
+        format  = '%(processName)s: %(name)s: %(levelname)s %(funcName)s : %(message)s',
+        level   = options.loglevel,
+    )
+
+    if options.clean :
+        options.clean_intervals = options.clean_snapshots = options.clean
+
+    return options, args
+
+def run_snapshot (options) :
+    """
+        Perform the rsync from source to given path.
+    """
+
+    snapshot_dir = os.path.join(options.destination, 'snapshots')
+
+    if not os.path.exists(snapshot_dir) :
+        log.warn("Creating snapshots dir: %s", snapshot_dir)
+        os.mkdir(snapshot_dir)
+    
+    # new snapshot
+    snapshot_name = options.now.strftime(options.snapshot_format)
+    snapshot_path = os.path.join(snapshot_dir, snapshot_name)
+    temp_path = os.path.join(snapshot_dir, 'new')
+
+    if os.path.exists(temp_path) :
+        raise Exception("Old temp snapshot dir remains, please clean up: {path}".format(path=temp_path))
+
+    log.info("Perform main snapshot: %s", snapshot_path)
+
+    # build rsync options
+    opts = dict(options.rsync_options)
+
+    if os.path.exists(options.current_path) :
+        # use as link-dest base; hardlinks unchanged files
+        opts['link-dest'] = options.current_path
+
+    # go
+    log.debug("rsync %s -> %s", options.source, temp_path)
+    rsync.rsync(options.source, temp_path, **opts)
+
+    # move in to final name
+    log.debug("rename %s -> %s", temp_path, snapshot_path)
+    os.rename(temp_path, snapshot_path)
+
+    return snapshot_name
+
+def update_interval (options, snapshot_name, interval) :
+    """
+        Update the interval/... links
+    """
+
+    dir_path = os.path.join(options.destination, interval)
+
+    if not os.path.exists(dir_path) :
+        log.warn("Creating interval dir: %s", dir_path)
+        os.mkdir(dir_path)
+    
+    # format code
+    name_fmt = options.interval_format[interval]
+
+    if name_fmt is None :
+        # keep all snapshots
+        name_fmt = options.snapshot_format
+
+    # name
+    name = options.now.strftime(name_fmt)
+
+    # path
+    path_name = os.path.join(interval, name)
+    path = os.path.join(options.destination, path_name)
+
+    log.debug("processing %s", path_name)
+
+    # already there?
+    if os.path.exists(path) :
+        target = os.readlink(path)
+
+        log.info("Found existing %s: %s -> %s", interval, name, target)
+
+    else :
+        # update
+        target = os.path.join('..', 'snapshots', snapshot_name)
+
+        log.info("Updating %s: %s -> %s", interval, name, target)
+        log.debug("%s -> %s", path, target)
+
+        os.symlink(target, path)
+
+
+def clean_interval (options, interval) :
+    """
+        Clean out old entries from interval dir.
+    """
+
+    # path
+    dir_path = os.path.join(options.destination, interval)
+
+    if not os.path.exists(dir_path) :
+        log.warn("%s: Skipping, no interval dir: %s", interval, dir_path)
+        return
+
+    # configured
+    retention = options.interval_retention[interval]
+
+    # clean?
+    items = os.listdir(dir_path)
+    items.sort()
+
+    log.info("%s: Have %d / %d items", interval, len(items), retention)
+    log.debug("%s: items: %s", interval, ' '.join(items))
+
+    if len(items) > retention :
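+        # clean out -- XXX: items is sorted ascending, so this slice drops the NEWEST names and keeps the oldest; --dry-run is not honoured here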
+        clean = items[retention:]
+
+        log.info("%s: Cleaning out %d items", interval, len(clean))
+        log.debug("%s: cleaning out: %s", interval, ' '.join(clean))
+
+        for item in clean :
+            path = os.path.join(dir_path, item)
+
+            log.info("%s: Clean: %s", interval, path)
+
+            os.unlink(path)
+
+def walk_symlinks (tree, ignore=False) :
+    """
+        Walk through all symlinks in given dir, yielding:
+
+            (dirpath, name, target)
+
+        Passes through errors from os.listdir/os.lstat.
+    """
+
+    for name in os.listdir(tree) :
+        if ignore and name in ignore :
+            log.debug("%s: ignore: %s", tree, name)
+            continue
+
+        path = os.path.join(tree, name)
+        
+        # stat symlink itself
+        st = os.lstat(path)
+
+        if stat.S_ISDIR(st.st_mode) :
+            # recurse
+            log.debug("%s: tree: %s", tree, name)
+
+            for item in walk_symlinks(path) :
+                yield item
+
+        elif stat.S_ISLNK(st.st_mode) :
+            # found
+            target = os.readlink(path)
+
+            log.debug("%s: link: %s -> %s", tree, name, target)
+
+            yield tree, name, target
+
+        else :
+            log.debug("%s: skip: %s", tree, name)
+
+
+def clean_snapshots (options) :
+    """
+        Clean out all snapshots not linked to from within dest.
+
+        Fails without doing anything if unable to read the destination dir.
+    """
+
+    # real path to snapshots
+    snapshots_path = os.path.realpath(os.path.abspath(os.path.join(options.destination, 'snapshots')))
+    log.debug("real snapshots_path: %s", snapshots_path)
+
+    # set of found targets
+    found = set()
+
+    # walk all symlinks
+    for dirpath, name, target in walk_symlinks(options.destination, ignore=set(['snapshots'])) :
+        # target dir
+        target_path = os.path.realpath(os.path.join(dirpath, target))
+        target_dir = os.path.dirname(target_path)
+        target_name = os.path.basename(target_path)
+
+        if target_dir == snapshots_path :
+            log.debug("%s: found: %s -> %s", dirpath, name, target_name)
+            found.add(target_name)
+
+        else :
+            log.debug("%s: ignore: %s -> %s", dirpath, name, target_path)
+
+    # discover all snapshots
+    snapshots = set(os.listdir(snapshots_path))
+
+    # clean out special names
+    snapshots = snapshots - set(['new'])
+
+    ## compare
+    used = snapshots & found
+    unused = snapshots - found
+    broken = found - snapshots
+
+    log.info("Found used=%d, unused=%d, broken=%d snapshot symlinks", len(used), len(unused), len(broken))
+    log.debug("used=%s, unused=%s", used, unused)
+
+    if broken :
+        log.warn("Found broken symlinks to snapshots: %s", ' '.join(broken))
+    
+    if unused :
+        log.info("Clean out unused snapshots: %s", ' '.join(unused))
+
+        for name in unused :
+            path = os.path.join(snapshots_path, name)
+
+            log.info("Clean: %s", name)
+
+            if not options.dry_run :
+                log.debug("rmtree: %s", path)
+
+                # nuke
+                shutil.rmtree(path)
+
+            else :
+                log.debug("dry-run: %s", path)
+
+def run (options) :
+    """
+        Perform the current snapshot
+    """
+
+    # timestamp for run
+    options.now = datetime.datetime.now()
+
+    # snapshot from source?
+    if options.source :
+        # base snapshot (symlink)
+        options.current_path = os.path.join(options.destination, 'current')
+
+        log.info("Started snapshot run at: %s", options.now)
+
+        # initial rsync
+        snapshot_name = run_snapshot(options)
+
+        # update current
+        log.info("Updating current -> %s", snapshot_name)
+
+        if os.path.islink(options.current_path) :
+            # replace
+            os.unlink(options.current_path)
+
+        os.symlink(os.path.join('snapshots', snapshot_name), options.current_path)
+
+        # intervals?
+        if not options.intervals :
+            log.info("No --intervals given; not running any")
+
+        else :
+            # maintain intervals
+            log.info("Running intervals: %s", options.intervals)
+
+            for interval in options.intervals :
+                log.debug("%s", interval)
+
+                # update
+                update_interval(options, snapshot_name, interval)
+
+    # clean intervals?
+    if options.clean_intervals:
+        for interval in options.intervals :
+            log.info("Cleaning interval: %s...", interval)
+
+            clean_interval(options, interval)
+
+    # clean snapshots?
+    if options.clean_snapshots :
+        log.info("Cleaning snapshots...")
+
+        clean_snapshots(options)
+
+    # ok -- XXX: main() hands this 1 to sys.exit(), i.e. a non-zero (failure) exit status
+    return 1
+
+def main (argv) :
+    global options
+
+    # global options + args
+    options, args = parse_options(argv)
+
+    # XXX: args?
+    if args :
+        log.error("No arguments are handled")
+        return 2
+
+    try :
+        # handle it
+        return run(options)
+
+    except Exception, e:
+        log.error("Internal error:", exc_info=e)
+        return 3
+
+    # ok
+    return 0
+
+
+
+if __name__ == '__main__' :
+    import sys
+
+    sys.exit(main(sys.argv))
+
--- a/scripts/pvlbackup-rsync-wrapper	Tue Feb 14 20:29:11 2012 +0200
+++ b/scripts/pvlbackup-rsync-wrapper	Tue Feb 14 21:51:30 2012 +0200
@@ -117,6 +117,7 @@
         log.error("No arguments are handled")
         return 2
 
+    # command required
     if not options.command:
         log.error("SSH_ORIGINAL_COMMAND not given")
         return 2