bin/search-index
changeset 140 6db2527b67cf
parent 121 86aebc9cb60b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/search-index	Sun Sep 13 01:15:56 2009 +0300
@@ -0,0 +1,640 @@
+#!/usr/bin/env python2.5
+
+"""
+    Tool for accessing the search index
+"""
+
+# XXX: fix path
+import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
+
+import os, os.path, fcntl
+import datetime, pytz
+import optparse
+
+# configuration and the LogSearchIndex module
+from qmsk.irclogs import config, utils, log_search, channels
+
+def _open_index (options, open_mode) :
+    """
+        Opens the LogSearchIndex
+    """
+
+    return log_search.LogSearchIndex(config.LOG_CHANNELS, options.index_path, open_mode)
+
+
+def _open_index_and_channel (options, channel_name, open_mode) :
+    """
+        Opens+returns a LogSearchIndex and a LogChannel
+    """
+    
+    # open the LogSearchIndex
+    index = _open_index(options, open_mode)
+
+    # open the channel
+    channel = config.LOG_CHANNELS.lookup(channel_name)
+    
+    # return
+    return index, channel
+
+def _iter_insert_stats (index, channel, lines) :
+    """
+        Insert the given lines into the index.
+
+        Assumes the lines will be in time-order, and yields a series of (date, count) tuples for every date that lines
+        are inserted for
+    """
+
+    # last date
+    date = None
+
+    # count
+    count = 0
+
+    # iter lines
+    for line in lines :
+        # next day?
+        if not date or line.timestamp.date() != date :
+            if date :
+                # yield stats
+                yield date, count
+
+            # reset count
+            count = 0
+
+            # timestamp's date
+            date = line.timestamp.date()
+
+        # insert
+        index.insert_line(channel, line)
+
+        # count
+        count += 1
+    
+    # final count?
+    if date and count :
+        yield date, count
+
+def _insert_lines (index, options, channel, lines) :
+    """
+        Insert the given lines into the index.
+
+        Assumes the lines will be in time-order, and prints out as status messages the date and count for the inserted lines
+    """
+    
+    # iterate insert stats
+    for date, count in _iter_insert_stats(index, channel, lines) :
+        # output date header?
+        if not options.quiet :
+            print "%s: %s" % (date.strftime('%Y-%m-%d'), count),
+
+def _load_channel_date (index, options, channel, date) :
+    """
+        Loads the logs for the given date from the channel's LogSource into the given LogSearchIndex
+    """
+
+    if not options.quiet :
+        print "Loading date for channel %s" % channel.id
+        
+    try :
+        # load lines for date
+        lines = channel.source.get_date(date)
+    
+    except Exception, e :
+        if not options.skip_missing :
+            raise
+            
+        if not options.quiet :
+            print "\tSkipped: %s" % (e, )
+    
+    else :
+        # insert
+        _insert_lines(index, options, channel, lines)
+
+def _parse_date (options, date_str, tz=None, fmt='%Y-%m-%d') :
+    """
+        Parse the given datetime, using the given timezone(defaults to options.tz) and format
+    """
+
+    # default tz
+    if not tz :
+        tz = options.timezone
+
+    try :
+        # parse
+        return datetime.datetime.strptime(date_str, fmt).replace(tzinfo=tz)
+
+    except Exception, e :
+        raise CommandError("[ERROR] Invalid date: %s: %s" % (date_str, e))
+
+def _output_lines (options, lines) :
+    """
+        Display the formatted LogLines
+    """
+
+    # display as plaintext
+    for line, txt_data in options.formatter.format_txt(lines, full_timestamps=True) :
+        print txt_data
+
+class CommandError (Exception) :
+    """
+        Error with command-line arguments
+    """
+
+    pass
+
+def cmd_create (options) :
+    """
+        Creates a new index
+    """
+
+    # open index
+    index = _open_index(options, 'ctrunc' if options.force else 'c')
+
+    # that's all
+    pass
+
+def cmd_load (options, channel_name, *dates) :
+    """
+        Loads the logs for a specific channel for the given dates (in terms of the channe logs' timezone) into the index
+    """
+
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, 'c' if options.create else 'a')
+    
+    # handle each date
+    for date_str in dates :
+        # prase date
+        try :
+            date = _parse_date(options, date_str, channel.source.tz)
+        
+        # handle errors
+        except CommandError, e :
+            if options.skip_missing :
+                print "[ERROR] %s" % (date_name, e)
+
+            else :
+                raise
+        
+        # otherwise, load
+        else :        
+            _load_channel_date(index, options, channel, date)
+
+def cmd_load_month (options, channel_name, *months) :
+    """
+        Loads the logs for a specific channel for the given months (in terms of the channel's timezone) into the index
+    """
+
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, 'c' if options.create else 'a')
+    
+    # handle each date
+    for month_str in months :
+        # prase date
+        try :
+            month = _parse_date(options, month_str, channel.source.tz, '%Y-%m')
+        
+        # handle errors
+        except CommandError, e :
+            # skip?
+            if options.skip_missing :
+                if not options.quiet :
+                    print "[ERROR] %s" % (date_name, e)
+                continue
+
+            else :
+                raise
+        
+        # get the set of days
+        days = list(channel.source.get_month_days(month))
+        
+        if not options.quiet :
+            print "Loading %d days of logs:" % (len(days))
+
+        # load each day
+        for date in days :
+            # convert to datetime
+            dt = datetime.datetime.combine(date, datetime.time(0)).replace(tzinfo=channel.source.tz)
+            
+            # load
+            _load_channel_date(index, options, channel, dt)
+
+def cmd_search (options, channel_name, query) :
+    """
+        Search the index for events on a specific channel with the given query
+    """
+    
+    # sanity-check
+    if options.create :
+        raise Exception("--create doesn't make sense for 'search'")
+    
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, 'r')
+    
+    # search
+    lines = index.search_simple(channel, query)
+    
+    # display
+    _output_lines(options, lines)
+
+def cmd_list (options, channel_name, *dates) :
+    """
+        List the indexed events for a specific date
+    """
+
+    # sanity-check
+    if options.create :
+        raise Exception("--create doesn't make sense for 'search'")
+    
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, 'r')
+
+    # ...for each date
+    for date_str in dates :
+        # parse date
+        date = _parse_date(options, date_str)
+
+        # list
+        lines = index.list(channel, date)
+        
+        # display
+        _output_lines(options, lines)
+
+def _autoload_reset (options, channels) :
+    """
+        Reset old autoload state
+    """
+    
+    # warn
+    if not options.quiet :
+        print "[WARN] Resetting autoload state for: %s" % ', '.join(channel.id for channel in channels)
+    
+    # iter
+    for channel in channels :
+        # statefile path
+        statefile_path = os.path.join(options.autoload_state_path, 'chan-%s' % channel.id)
+
+        # is it present?
+        if not os.path.exists(statefile_path) :
+            if not options.quiet :
+                print "[WARN] No statefile found at %s" % statefile_path
+        
+        else :
+            if not options.quiet :
+                print "\t%s: " % channel.id,
+
+            # remove the statefile
+            os.remove(statefile_path)
+            
+            if not options.quiet :
+                print "OK"
+
+def cmd_autoload (options, *channel_names) :
+    """
+        Automatically loads all channel logs that have not been indexed yet (by logfile mtime)
+    """
+    
+    # NOTE: the docstring above is shown to the user by cmd_help, so it is kept to a single line
+    #
+    # Two files under options.autoload_state_path are used per channel:
+    #
+    #   chan-<id>       - empty statefile, its mtime is read as the time of the last completed load
+    #   chan-<id>.tmp   - progress file, holds the UTC timestamp of the last date inserted by a load that is still
+    #                     running (or that was interrupted), and is kept exclusively locked while open
+    #
+    # A leftover .tmp file is used to resume from, unless --ignore-resume is given.
+
+    # open index, nonblocking
+    index = _open_index(options, 'c?' if options.create else 'a?')
+
+    # default to all channels
+    if not channel_names :
+        channels = config.LOG_CHANNELS
+    
+    else :
+        channels = [config.LOG_CHANNELS.lookup(channel_name) for channel_name in channel_names]
+    
+    # reset autoload state?
+    if options.reset :
+        _autoload_reset(options, channels)
+        if not options.quiet :
+            print
+
+    # iterate channels
+    for channel in channels :
+        if not options.quiet :
+            print "Channel %s:" % channel.id
+
+        # no 'from' by default
+        after = None
+
+        # path to our state file
+        statefile_path = os.path.join(options.autoload_state_path, 'chan-%s' % channel.id)
+        statefile_tmppath = statefile_path + '.tmp'
+
+        # does it exist?
+        have_tmpfile = os.path.exists(statefile_tmppath)
+        
+        # do we have a tempfile from a previous crash?
+        if have_tmpfile and not options.ignore_resume :
+            # first, open it...
+            # ('r+' so that the old progress can be read here, and new progress written to the same file below)
+            statefile_tmp = open(statefile_tmppath, 'r+')
+
+            # ... then lock it
+            # (LOCK_NB: fails right away instead of waiting if the lock is already held)
+            fcntl.lockf(statefile_tmp, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            
+            # read after timestamp
+            after_str = statefile_tmp.read().rstrip()
+
+            if after_str :
+                # parse timestamp
+                after = utils.from_utc_timestamp(int(after_str))
+
+                if not options.quiet :
+                    print "\tContinuing earlier progress from %s" % after
+
+            else :
+                # ignore
+                if not options.quiet :
+                    print "\t[WARN] Ignoring empty temporary statefile"
+
+        else :
+            # warn about old tmpfile that was ignored
+            if have_tmpfile and not options.quiet :
+                print "\t[WARN] Ignoring old tmpfile state"
+
+            # open new tempfile
+            # NOTE(review): 'w' truncates any existing tmpfile before the lock below has been acquired
+            statefile_tmp = open(statefile_tmppath, 'w')
+            
+            # lock
+            fcntl.lockf(statefile_tmp, fcntl.LOCK_EX | fcntl.LOCK_NB)
+
+        # override?
+        if options.reload :
+            # load all
+            mtime = None
+
+            if not options.quiet :
+                print "\tForcing reload!"
+
+        # stat for mtime
+        else :
+            # stat for mtime, None if unknown
+            mtime = utils.mtime(statefile_path, ignore_missing=True)
+
+            if mtime and not options.quiet :
+                print "\tLast load time was %s" % mtime
+
+            elif not options.quiet :
+                print "\t[WARN] No previous load state! Loading full logs"
+ 
+        # only after some specific date?
+        if options.after :
+            # use unless read from tempfile
+            if not after :
+                after = options.after
+               
+                if not options.quiet :
+                    print "\tOnly including dates from %s onwards" % after
+            
+            else :
+                if not options.quiet :
+                    print "\t[WARN] Ignoring --from because we found a tempfile"
+            
+        # only up to some specific date?
+        if options.until :
+            until = options.until
+
+            if not options.quiet :
+                print "\tOnly including dates up to (and including) %s" % until
+        else :
+            # default to now
+            until = None
+
+        # get lines
+        # (mtime, after and until may each be None here)
+        # NOTE(review): presumably get_modified returns the lines lazily and in time-order, as _iter_insert_stats
+        # assumes - confirm against the LogSource implementation
+        lines = channel.source.get_modified(mtime, after, until)
+        
+        # insert
+        if not options.quiet :
+            print "\tLoading and inserting..."
+            print
+     
+        # iterate insert() per day to display info and update progress
+        for date, count in _iter_insert_stats(index, channel, lines) :
+            # output date header?
+            if not options.quiet :
+                print "\t%10s: %d" % (date.strftime('%Y-%m-%d'), count)
+            
+            # write temp state
+            # (overwrites from the start of the file with the timestamp for midnight of the date just completed)
+            # NOTE(review): the file is not truncated after the write, this relies on the timestamp strings never
+            # getting shorter
+            statefile_tmp.seek(0)
+            statefile_tmp.write(str(utils.to_utc_timestamp(datetime.datetime.combine(date, datetime.time(0)))))
+            statefile_tmp.flush()
+
+        # write autoload state
+        # ((re)creates the empty statefile, so that its mtime marks this load for the next run)
+        open(statefile_path, 'w').close()
+
+        # close+delete tempfile
+        # (only reached once the channel was loaded without errors, otherwise the tempfile is left for resuming)
+        statefile_tmp.close()
+        os.remove(statefile_tmppath)
+        
+        if not options.quiet :
+            print
+    
+    # done
+    return
+
+def cmd_help (options, *args) :
+    """
+        Help about commands
+    """
+
+    import inspect
+    
+    # general help stuff
+    options._parser.print_help()
+
+    # specific command?
+    if args :
+        # the command name
+        command, = args
+        
+        # XXX: display info about specific command
+        xxx
+    
+    # general
+    else :
+        print
+        print "Available commands:"
+
+        # build list of all cmd_* objects
+        cmd_objects = [(name, obj) for name, obj in globals().iteritems() if name.startswith('cmd_') and inspect.isfunction(obj)]
+
+        # sort alphabetically
+        cmd_objects.sort()
+        
+        # iterate through all cmd_* objects
+        for cmd_func_name, cmd_func in cmd_objects :
+            # remove cmd_ prefix
+            cmd_name = cmd_func_name[4:]
+
+            # inspect
+            cmd_args, cmd_varargs, cmd_varkw, cmd_default = inspect.getargspec(cmd_func)
+            cmd_doc = inspect.getdoc(cmd_func)
+
+            # remove the "options" arg
+            cmd_args = cmd_args[1:]
+
+            # display
+            print "\t%10s %-30s : %s" % (cmd_name, inspect.formatargspec(cmd_args, cmd_varargs, None, cmd_default), cmd_doc)
+
+class MyOption (optparse.Option) :
+    """
+        Our custom types for optparse
+    """
+
+    def check_date (option, opt, value) :
+        """
+            Parse a date
+        """
+
+        try :
+            # parse
+            return datetime.datetime.strptime(value, '%Y-%m-%d')
+        
+        # trap -> OptionValueError
+        except Exception, e :
+            raise optparse.OptionValueError("option %s: invalid date value: %r" % (opt, value))
+    
+    def check_timezone (option, opt, value) :
+        """
+            Parse a timezone
+        """
+
+        try :
+            # parse
+            return pytz.timezone(value)
+        
+        # trap -> OptionValueError
+        except Exception, e :
+            raise optparse.OptionValueError("option %s: invalid timezone: %r" % (opt, value))
+
+    def take_action (self, action, dest, opt, value, values, parser) :
+        """
+            Override take_action to handle date
+        """
+
+        if action == "parse_date" :
+            # get timezone
+            tz = values.timezone
+
+            # set timezone
+            value = value.replace(tzinfo=tz)
+
+            # store
+            return optparse.Option.take_action(self, 'store', dest, opt, value, values, parser)
+
+        else :
+            # default
+            return optparse.Option.take_action(self, action, dest, opt, value, values, parser)
+
+    TYPES = optparse.Option.TYPES + ('date', 'timezone')
+    TYPE_CHECKER = optparse.Option.TYPE_CHECKER.copy()
+    TYPE_CHECKER['date'] = check_date
+    TYPE_CHECKER['timezone'] = check_timezone
+    ACTIONS = optparse.Option.ACTIONS + ('parse_date', )
+    STORE_ACTIONS = optparse.Option.STORE_ACTIONS + ('parse_date', )
+    TYPED_ACTIONS = optparse.Option.TYPED_ACTIONS + ('parse_date', )
+    ACTIONS = optparse.Option.ACTIONS + ('parse_date', )
+
+def main (argv) :
+    """
+        Command-line main, with given argv
+    """
+
+    # define parser
+    parser = optparse.OptionParser(
+        usage           = "%prog [options] <command> [ ... ]",
+        add_help_option = False,
+        option_class    = MyOption,
+    )
+
+    # general options       #                   #                       #                                   #
+    general = optparse.OptionGroup(parser, "General Options")
+    general.add_option('-h', "--help",          dest="help",            help="Show this help message and exit",     
+                                                action="store_true"                                         )
+
+    general.add_option(     "--formatter",      dest="formatter_name",  help="LogFormatter to use",                 
+            metavar="FMT",  type="choice",                              default=config.PREF_FORMATTER_DEFAULT.name,
+            choices=[fmt_name for fmt_name in config.LOG_FORMATTERS.iterkeys()]                             )
+
+    general.add_option(     "--index",          dest="index_path",      help="Index database path",                 
+            metavar="PATH",                                             default=config.SEARCH_INDEX_PATH    )
+
+    general.add_option(     "--timezone",       dest="timezone",        help="Timezone for output",                 
+            metavar="TZ",   type="timezone",                            default=pytz.utc                    )
+
+    general.add_option(     "--force",          dest="force",           help="Force dangerous operation",           
+                                                action="store_true"                                         )
+
+    general.add_option(     "--quiet",          dest="quiet",           help="Supress status messages",             
+                                                action="store_true"                                         )
+    parser.add_option_group(general)
+    
+
+    # cmd_load options      #                   #                       #                                   #
+    load = optparse.OptionGroup(parser, "Load Options")
+    load.add_option(        "--skip-missing",   dest="skip_missing",    help="Skip missing logfiles",
+                                                action="store_true"                                         )
+
+    load.add_option(        "--create",         dest="create",          help="Create index database", 
+                                                action="store_true"                                         )
+    parser.add_option_group(load)
+    
+
+    # cmd_autoload options  #                   #                       #                                   #
+    autoload = optparse.OptionGroup(parser, "Autoload Options")
+    autoload.add_option(    "--autoload-state", dest="autoload_state_path", help="Path to autoload state dir",      
+            metavar="PATH",                                             default=config.SEARCH_AUTOINDEX_PATH)
+
+    autoload.add_option(    "--from",           dest="after",           help="Only autoload logfiles from the given date on", 
+            metavar="DATE", type="date",        action="parse_date",    default=None                        )
+
+    autoload.add_option(    "--until",          dest="until",           help="Only autoload logfiles up to (and including) the given date",  
+            metavar="DATE", type="date",        action="parse_date",    default=None                        )
+
+    autoload.add_option(    "--reload",         dest="reload",          help="Force reload lines",
+                                                action="store_true"                                         )
+
+    autoload.add_option(    "--reset",          dest="reset",           help="Reset old autload state",
+                                                action="store_true"                                         )
+
+    autoload.add_option(    "--ignore-resume",  dest="ignore_resume",   help="Do not try and resume interrupted autoload",  
+                                                action="store_true"                                         )
+    parser.add_option_group(autoload)
+
+    # parse
+    options, args = parser.parse_args(argv[1:])
+
+    # postprocess stuff
+    options._parser = parser
+    options.formatter = config.LOG_FORMATTERS[options.formatter_name](options.timezone, "%H:%M:%S", None, None)
+
+    # special-case --help
+    if options.help :
+        return cmd_help(options, *args)
+    
+    # must have at least the command argument
+    if not args :
+        raise CommandError("Missing command")
+    
+    # pop command
+    command = args.pop(0)
+    
+    # get func
+    func = globals().get('cmd_%s' % command)
+    
+    # unknown command?
+    if not func :
+        raise CommandError("Unknown command: %s" % command)
+    
+    # call
+    func(options, *args)
+
+if __name__ == '__main__' :
+    try :
+        main(sys.argv)
+        sys.exit(0)
+
+    except CommandError, e :
+        print e
+        sys.exit(1)
+