bin/search-index
changeset 143 154d2d8ae9c0
parent 142 e163794ccf54
child 144 35c4c56f1376
equal deleted inserted replaced
142:e163794ccf54 143:154d2d8ae9c0
     1 #!/usr/bin/env python2.5
       
     2 
       
     3 """
       
     4     Tool for accessing the search index
       
     5 """
       
     6 
       
     7 # XXX: fix path
       
     8 import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
       
     9 
       
    10 import os, os.path, fcntl
       
    11 import datetime, pytz
       
    12 import optparse
       
    13 
       
    14 # configuration and the LogSearchIndex module
       
    15 from qmsk.irclogs import config, utils, log_search, channels
       
    16 
       
    17 def _open_index (options, open_mode) :
       
    18     """
       
    19         Opens the LogSearchIndex
       
    20     """
       
    21 
       
    22     return log_search.LogSearchIndex(config.LOG_CHANNELS, options.index_path, open_mode)
       
    23 
       
    24 
       
    25 def _open_index_and_channel (options, channel_name, open_mode) :
       
    26     """
       
    27         Opens+returns a LogSearchIndex and a LogChannel
       
    28     """
       
    29     
       
    30     # open the LogSearchIndex
       
    31     index = _open_index(options, open_mode)
       
    32 
       
    33     # open the channel
       
    34     channel = config.LOG_CHANNELS.lookup(channel_name)
       
    35     
       
    36     # return
       
    37     return index, channel
       
    38 
       
    39 def _iter_insert_stats (index, channel, lines) :
       
    40     """
       
    41         Insert the given lines into the index.
       
    42 
       
    43         Assumes the lines will be in time-order, and yields a series of (date, count) tuples for every date that lines
       
    44         are inserted for
       
    45     """
       
    46 
       
    47     # last date
       
    48     date = None
       
    49 
       
    50     # count
       
    51     count = 0
       
    52 
       
    53     # iter lines
       
    54     for line in lines :
       
    55         # next day?
       
    56         if not date or line.timestamp.date() != date :
       
    57             if date :
       
    58                 # yield stats
       
    59                 yield date, count
       
    60 
       
    61             # reset count
       
    62             count = 0
       
    63 
       
    64             # timestamp's date
       
    65             date = line.timestamp.date()
       
    66 
       
    67         # insert
       
    68         index.insert_line(channel, line)
       
    69 
       
    70         # count
       
    71         count += 1
       
    72     
       
    73     # final count?
       
    74     if date and count :
       
    75         yield date, count
       
    76 
       
    77 def _insert_lines (index, options, channel, lines) :
       
    78     """
       
    79         Insert the given lines into the index.
       
    80 
       
    81         Assumes the lines will be in time-order, and prints out as status messages the date and count for the inserted lines
       
    82     """
       
    83     
       
    84     # iterate insert stats
       
    85     for date, count in _iter_insert_stats(index, channel, lines) :
       
    86         # output date header?
       
    87         if not options.quiet :
       
    88             print "%s: %s" % (date.strftime('%Y-%m-%d'), count),
       
    89 
       
    90 def _load_channel_date (index, options, channel, date) :
       
    91     """
       
    92         Loads the logs for the given date from the channel's LogSource into the given LogSearchIndex
       
    93     """
       
    94 
       
    95     if not options.quiet :
       
    96         print "Loading date for channel %s" % channel.id
       
    97         
       
    98     try :
       
    99         # load lines for date
       
   100         lines = channel.source.get_date(date)
       
   101     
       
   102     except Exception, e :
       
   103         if not options.skip_missing :
       
   104             raise
       
   105             
       
   106         if not options.quiet :
       
   107             print "\tSkipped: %s" % (e, )
       
   108     
       
   109     else :
       
   110         # insert
       
   111         _insert_lines(index, options, channel, lines)
       
   112 
       
   113 def _parse_date (options, date_str, tz=None, fmt='%Y-%m-%d') :
       
   114     """
       
   115         Parse the given datetime, using the given timezone(defaults to options.tz) and format
       
   116     """
       
   117 
       
   118     # default tz
       
   119     if not tz :
       
   120         tz = options.timezone
       
   121 
       
   122     try :
       
   123         # parse
       
   124         return datetime.datetime.strptime(date_str, fmt).replace(tzinfo=tz)
       
   125 
       
   126     except Exception, e :
       
   127         raise CommandError("[ERROR] Invalid date: %s: %s" % (date_str, e))
       
   128 
       
   129 def _output_lines (options, lines) :
       
   130     """
       
   131         Display the formatted LogLines
       
   132     """
       
   133 
       
   134     # display as plaintext
       
   135     for line, txt_data in options.formatter.format_txt(lines, full_timestamps=True) :
       
   136         print txt_data
       
   137 
       
   138 class CommandError (Exception) :
       
   139     """
       
   140         Error with command-line arguments
       
   141     """
       
   142 
       
   143     pass
       
   144 
       
   145 def cmd_create (options) :
       
   146     """
       
   147         Creates a new index
       
   148     """
       
   149 
       
   150     # open index
       
   151     index = _open_index(options, 'ctrunc' if options.force else 'c')
       
   152 
       
   153     # that's all
       
   154     pass
       
   155 
       
   156 def cmd_load (options, channel_name, *dates) :
       
   157     """
       
   158         Loads the logs for a specific channel for the given dates (in terms of the channe logs' timezone) into the index
       
   159     """
       
   160 
       
   161     # open index/channel
       
   162     index, channel = _open_index_and_channel(options, channel_name, 'c' if options.create else 'a')
       
   163     
       
   164     # handle each date
       
   165     for date_str in dates :
       
   166         # prase date
       
   167         try :
       
   168             date = _parse_date(options, date_str, channel.source.tz)
       
   169         
       
   170         # handle errors
       
   171         except CommandError, e :
       
   172             if options.skip_missing :
       
   173                 print "[ERROR] %s" % (date_name, e)
       
   174 
       
   175             else :
       
   176                 raise
       
   177         
       
   178         # otherwise, load
       
   179         else :        
       
   180             _load_channel_date(index, options, channel, date)
       
   181 
       
   182 def cmd_load_month (options, channel_name, *months) :
       
   183     """
       
   184         Loads the logs for a specific channel for the given months (in terms of the channel's timezone) into the index
       
   185     """
       
   186 
       
   187     # open index/channel
       
   188     index, channel = _open_index_and_channel(options, channel_name, 'c' if options.create else 'a')
       
   189     
       
   190     # handle each date
       
   191     for month_str in months :
       
   192         # prase date
       
   193         try :
       
   194             month = _parse_date(options, month_str, channel.source.tz, '%Y-%m')
       
   195         
       
   196         # handle errors
       
   197         except CommandError, e :
       
   198             # skip?
       
   199             if options.skip_missing :
       
   200                 if not options.quiet :
       
   201                     print "[ERROR] %s" % (date_name, e)
       
   202                 continue
       
   203 
       
   204             else :
       
   205                 raise
       
   206         
       
   207         # get the set of days
       
   208         days = list(channel.source.get_month_days(month))
       
   209         
       
   210         if not options.quiet :
       
   211             print "Loading %d days of logs:" % (len(days))
       
   212 
       
   213         # load each day
       
   214         for date in days :
       
   215             # convert to datetime
       
   216             dt = datetime.datetime.combine(date, datetime.time(0)).replace(tzinfo=channel.source.tz)
       
   217             
       
   218             # load
       
   219             _load_channel_date(index, options, channel, dt)
       
   220 
       
   221 def cmd_search (options, channel_name, query) :
       
   222     """
       
   223         Search the index for events on a specific channel with the given query
       
   224     """
       
   225     
       
   226     # sanity-check
       
   227     if options.create :
       
   228         raise Exception("--create doesn't make sense for 'search'")
       
   229     
       
   230     # open index/channel
       
   231     index, channel = _open_index_and_channel(options, channel_name, 'r')
       
   232     
       
   233     # search
       
   234     lines = index.search_simple(channel, query)
       
   235     
       
   236     # display
       
   237     _output_lines(options, lines)
       
   238 
       
   239 def cmd_list (options, channel_name, *dates) :
       
   240     """
       
   241         List the indexed events for a specific date
       
   242     """
       
   243 
       
   244     # sanity-check
       
   245     if options.create :
       
   246         raise Exception("--create doesn't make sense for 'search'")
       
   247     
       
   248     # open index/channel
       
   249     index, channel = _open_index_and_channel(options, channel_name, 'r')
       
   250 
       
   251     # ...for each date
       
   252     for date_str in dates :
       
   253         # parse date
       
   254         date = _parse_date(options, date_str)
       
   255 
       
   256         # list
       
   257         lines = index.list(channel, date)
       
   258         
       
   259         # display
       
   260         _output_lines(options, lines)
       
   261 
       
   262 def _autoload_reset (options, channels) :
       
   263     """
       
   264         Reset old autoload state
       
   265     """
       
   266     
       
   267     # warn
       
   268     if not options.quiet :
       
   269         print "[WARN] Resetting autoload state for: %s" % ', '.join(channel.id for channel in channels)
       
   270     
       
   271     # iter
       
   272     for channel in channels :
       
   273         # statefile path
       
   274         statefile_path = os.path.join(options.autoload_state_path, 'chan-%s' % channel.id)
       
   275 
       
   276         # is it present?
       
   277         if not os.path.exists(statefile_path) :
       
   278             if not options.quiet :
       
   279                 print "[WARN] No statefile found at %s" % statefile_path
       
   280         
       
   281         else :
       
   282             if not options.quiet :
       
   283                 print "\t%s: " % channel.id,
       
   284 
       
   285             # remove the statefile
       
   286             os.remove(statefile_path)
       
   287             
       
   288             if not options.quiet :
       
   289                 print "OK"
       
   290 
       
   291 def cmd_autoload (options, *channel_names) :
       
   292     """
       
   293         Automatically loads all channel logs that have not been indexed yet (by logfile mtime)
       
   294     """
       
   295     
       
   296     # open index, nonblocking
       
   297     index = _open_index(options, 'c?' if options.create else 'a?')
       
   298 
       
   299     # default to all channels
       
   300     if not channel_names :
       
   301         channels = config.LOG_CHANNELS
       
   302     
       
   303     else :
       
   304         channels = [config.LOG_CHANNELS.lookup(channel_name) for channel_name in channel_names]
       
   305     
       
   306     # reset autoload state?
       
   307     if options.reset :
       
   308         _autoload_reset(options, channels)
       
   309         if not options.quiet :
       
   310             print
       
   311 
       
   312     # iterate channels
       
   313     for channel in channels :
       
   314         if not options.quiet :
       
   315             print "Channel %s:" % channel.id
       
   316 
       
   317         # no 'from' by default
       
   318         after = None
       
   319 
       
   320         # path to our state file
       
   321         statefile_path = os.path.join(options.autoload_state_path, 'chan-%s' % channel.id)
       
   322         statefile_tmppath = statefile_path + '.tmp'
       
   323 
       
   324         # does it exist?
       
   325         have_tmpfile = os.path.exists(statefile_tmppath)
       
   326         
       
   327         # do we have a tempfile from a previous crash?
       
   328         if have_tmpfile and not options.ignore_resume :
       
   329             # first, open it...
       
   330             statefile_tmp = open(statefile_tmppath, 'r+')
       
   331 
       
   332             # ... then lock it
       
   333             fcntl.lockf(statefile_tmp, fcntl.LOCK_EX | fcntl.LOCK_NB)
       
   334             
       
   335             # read after timestamp
       
   336             after_str = statefile_tmp.read().rstrip()
       
   337 
       
   338             if after_str :
       
   339                 # parse timestamp
       
   340                 after = utils.from_utc_timestamp(int(after_str))
       
   341 
       
   342                 if not options.quiet :
       
   343                     print "\tContinuing earlier progress from %s" % after
       
   344 
       
   345             else :
       
   346                 # ignore
       
   347                 if not options.quiet :
       
   348                     print "\t[WARN] Ignoring empty temporary statefile"
       
   349 
       
   350         else :
       
   351             # warn about old tmpfile that was ignored
       
   352             if have_tmpfile and not options.quiet :
       
   353                 print "\t[WARN] Ignoring old tmpfile state"
       
   354 
       
   355             # open new tempfile
       
   356             statefile_tmp = open(statefile_tmppath, 'w')
       
   357             
       
   358             # lock
       
   359             fcntl.lockf(statefile_tmp, fcntl.LOCK_EX | fcntl.LOCK_NB)
       
   360 
       
   361         # override?
       
   362         if options.reload :
       
   363             # load all
       
   364             mtime = None
       
   365 
       
   366             if not options.quiet :
       
   367                 print "\tForcing reload!"
       
   368 
       
   369         # stat for mtime
       
   370         else :
       
   371             # stat for mtime, None if unknown
       
   372             mtime = utils.mtime(statefile_path, ignore_missing=True)
       
   373 
       
   374             if mtime and not options.quiet :
       
   375                 print "\tLast load time was %s" % mtime
       
   376 
       
   377             elif not options.quiet :
       
   378                 print "\t[WARN] No previous load state! Loading full logs"
       
   379  
       
   380         # only after some specific date?
       
   381         if options.after :
       
   382             # use unless read from tempfile
       
   383             if not after :
       
   384                 after = options.after
       
   385                
       
   386                 if not options.quiet :
       
   387                     print "\tOnly including dates from %s onwards" % after
       
   388             
       
   389             else :
       
   390                 if not options.quiet :
       
   391                     print "\t[WARN] Ignoring --from because we found a tempfile"
       
   392             
       
   393         # only up to some specific date?
       
   394         if options.until :
       
   395             until = options.until
       
   396 
       
   397             if not options.quiet :
       
   398                 print "\tOnly including dates up to (and including) %s" % until
       
   399         else :
       
   400             # default to now
       
   401             until = None
       
   402 
       
   403         # get lines
       
   404         lines = channel.source.get_modified(mtime, after, until)
       
   405         
       
   406         # insert
       
   407         if not options.quiet :
       
   408             print "\tLoading and inserting..."
       
   409             print
       
   410      
       
   411         # iterate insert() per day to display info and update progress
       
   412         for date, count in _iter_insert_stats(index, channel, lines) :
       
   413             # output date header?
       
   414             if not options.quiet :
       
   415                 print "\t%10s: %d" % (date.strftime('%Y-%m-%d'), count)
       
   416             
       
   417             # write temp state
       
   418             statefile_tmp.seek(0)
       
   419             statefile_tmp.write(str(utils.to_utc_timestamp(datetime.datetime.combine(date, datetime.time(0)))))
       
   420             statefile_tmp.flush()
       
   421 
       
   422         # write autoload state
       
   423         open(statefile_path, 'w').close()
       
   424 
       
   425         # close+delete tempfile
       
   426         statefile_tmp.close()
       
   427         os.remove(statefile_tmppath)
       
   428         
       
   429         if not options.quiet :
       
   430             print
       
   431     
       
   432     # done
       
   433     return
       
   434 
       
   435 def cmd_help (options, *args) :
       
   436     """
       
   437         Help about commands
       
   438     """
       
   439 
       
   440     import inspect
       
   441     
       
   442     # general help stuff
       
   443     options._parser.print_help()
       
   444 
       
   445     # specific command?
       
   446     if args :
       
   447         # the command name
       
   448         command, = args
       
   449         
       
   450         # XXX: display info about specific command
       
   451         xxx
       
   452     
       
   453     # general
       
   454     else :
       
   455         print
       
   456         print "Available commands:"
       
   457 
       
   458         # build list of all cmd_* objects
       
   459         cmd_objects = [(name, obj) for name, obj in globals().iteritems() if name.startswith('cmd_') and inspect.isfunction(obj)]
       
   460 
       
   461         # sort alphabetically
       
   462         cmd_objects.sort()
       
   463         
       
   464         # iterate through all cmd_* objects
       
   465         for cmd_func_name, cmd_func in cmd_objects :
       
   466             # remove cmd_ prefix
       
   467             cmd_name = cmd_func_name[4:]
       
   468 
       
   469             # inspect
       
   470             cmd_args, cmd_varargs, cmd_varkw, cmd_default = inspect.getargspec(cmd_func)
       
   471             cmd_doc = inspect.getdoc(cmd_func)
       
   472 
       
   473             # remove the "options" arg
       
   474             cmd_args = cmd_args[1:]
       
   475 
       
   476             # display
       
   477             print "\t%10s %-30s : %s" % (cmd_name, inspect.formatargspec(cmd_args, cmd_varargs, None, cmd_default), cmd_doc)
       
   478 
       
   479 class MyOption (optparse.Option) :
       
   480     """
       
   481         Our custom types for optparse
       
   482     """
       
   483 
       
   484     def check_date (option, opt, value) :
       
   485         """
       
   486             Parse a date
       
   487         """
       
   488 
       
   489         try :
       
   490             # parse
       
   491             return datetime.datetime.strptime(value, '%Y-%m-%d')
       
   492         
       
   493         # trap -> OptionValueError
       
   494         except Exception, e :
       
   495             raise optparse.OptionValueError("option %s: invalid date value: %r" % (opt, value))
       
   496     
       
   497     def check_timezone (option, opt, value) :
       
   498         """
       
   499             Parse a timezone
       
   500         """
       
   501 
       
   502         try :
       
   503             # parse
       
   504             return pytz.timezone(value)
       
   505         
       
   506         # trap -> OptionValueError
       
   507         except Exception, e :
       
   508             raise optparse.OptionValueError("option %s: invalid timezone: %r" % (opt, value))
       
   509 
       
   510     def take_action (self, action, dest, opt, value, values, parser) :
       
   511         """
       
   512             Override take_action to handle date
       
   513         """
       
   514 
       
   515         if action == "parse_date" :
       
   516             # get timezone
       
   517             tz = values.timezone
       
   518 
       
   519             # set timezone
       
   520             value = value.replace(tzinfo=tz)
       
   521 
       
   522             # store
       
   523             return optparse.Option.take_action(self, 'store', dest, opt, value, values, parser)
       
   524 
       
   525         else :
       
   526             # default
       
   527             return optparse.Option.take_action(self, action, dest, opt, value, values, parser)
       
   528 
       
   529     TYPES = optparse.Option.TYPES + ('date', 'timezone')
       
   530     TYPE_CHECKER = optparse.Option.TYPE_CHECKER.copy()
       
   531     TYPE_CHECKER['date'] = check_date
       
   532     TYPE_CHECKER['timezone'] = check_timezone
       
   533     ACTIONS = optparse.Option.ACTIONS + ('parse_date', )
       
   534     STORE_ACTIONS = optparse.Option.STORE_ACTIONS + ('parse_date', )
       
   535     TYPED_ACTIONS = optparse.Option.TYPED_ACTIONS + ('parse_date', )
       
   536     ACTIONS = optparse.Option.ACTIONS + ('parse_date', )
       
   537 
       
   538 def main (argv) :
       
   539     """
       
   540         Command-line main, with given argv
       
   541     """
       
   542 
       
   543     # define parser
       
   544     parser = optparse.OptionParser(
       
   545         usage           = "%prog [options] <command> [ ... ]",
       
   546         add_help_option = False,
       
   547         option_class    = MyOption,
       
   548     )
       
   549 
       
   550     # general options       #                   #                       #                                   #
       
   551     general = optparse.OptionGroup(parser, "General Options")
       
   552     general.add_option('-h', "--help",          dest="help",            help="Show this help message and exit",     
       
   553                                                 action="store_true"                                         )
       
   554 
       
   555     general.add_option(     "--formatter",      dest="formatter_name",  help="LogFormatter to use",                 
       
   556             metavar="FMT",  type="choice",                              default=config.PREF_FORMATTER_DEFAULT.name,
       
   557             choices=[fmt_name for fmt_name in config.LOG_FORMATTERS.iterkeys()]                             )
       
   558 
       
   559     general.add_option(     "--index",          dest="index_path",      help="Index database path",                 
       
   560             metavar="PATH",                                             default=config.SEARCH_INDEX_PATH    )
       
   561 
       
   562     general.add_option(     "--timezone",       dest="timezone",        help="Timezone for output",                 
       
   563             metavar="TZ",   type="timezone",                            default=pytz.utc                    )
       
   564 
       
   565     general.add_option(     "--force",          dest="force",           help="Force dangerous operation",           
       
   566                                                 action="store_true"                                         )
       
   567 
       
   568     general.add_option(     "--quiet",          dest="quiet",           help="Supress status messages",             
       
   569                                                 action="store_true"                                         )
       
   570     parser.add_option_group(general)
       
   571     
       
   572 
       
   573     # cmd_load options      #                   #                       #                                   #
       
   574     load = optparse.OptionGroup(parser, "Load Options")
       
   575     load.add_option(        "--skip-missing",   dest="skip_missing",    help="Skip missing logfiles",
       
   576                                                 action="store_true"                                         )
       
   577 
       
   578     load.add_option(        "--create",         dest="create",          help="Create index database", 
       
   579                                                 action="store_true"                                         )
       
   580     parser.add_option_group(load)
       
   581     
       
   582 
       
   583     # cmd_autoload options  #                   #                       #                                   #
       
   584     autoload = optparse.OptionGroup(parser, "Autoload Options")
       
   585     autoload.add_option(    "--autoload-state", dest="autoload_state_path", help="Path to autoload state dir",      
       
   586             metavar="PATH",                                             default=config.SEARCH_AUTOINDEX_PATH)
       
   587 
       
   588     autoload.add_option(    "--from",           dest="after",           help="Only autoload logfiles from the given date on", 
       
   589             metavar="DATE", type="date",        action="parse_date",    default=None                        )
       
   590 
       
   591     autoload.add_option(    "--until",          dest="until",           help="Only autoload logfiles up to (and including) the given date",  
       
   592             metavar="DATE", type="date",        action="parse_date",    default=None                        )
       
   593 
       
   594     autoload.add_option(    "--reload",         dest="reload",          help="Force reload lines",
       
   595                                                 action="store_true"                                         )
       
   596 
       
   597     autoload.add_option(    "--reset",          dest="reset",           help="Reset old autload state",
       
   598                                                 action="store_true"                                         )
       
   599 
       
   600     autoload.add_option(    "--ignore-resume",  dest="ignore_resume",   help="Do not try and resume interrupted autoload",  
       
   601                                                 action="store_true"                                         )
       
   602     parser.add_option_group(autoload)
       
   603 
       
   604     # parse
       
   605     options, args = parser.parse_args(argv[1:])
       
   606 
       
   607     # postprocess stuff
       
   608     options._parser = parser
       
   609     options.formatter = config.LOG_FORMATTERS[options.formatter_name](options.timezone, "%H:%M:%S", None, None)
       
   610 
       
   611     # special-case --help
       
   612     if options.help :
       
   613         return cmd_help(options, *args)
       
   614     
       
   615     # must have at least the command argument
       
   616     if not args :
       
   617         raise CommandError("Missing command")
       
   618     
       
   619     # pop command
       
   620     command = args.pop(0)
       
   621     
       
   622     # get func
       
   623     func = globals().get('cmd_%s' % command)
       
   624     
       
   625     # unknown command?
       
   626     if not func :
       
   627         raise CommandError("Unknown command: %s" % command)
       
   628     
       
   629     # call
       
   630     func(options, *args)
       
   631 
       
   632 if __name__ == '__main__' :
       
   633     try :
       
   634         main(sys.argv)
       
   635         sys.exit(0)
       
   636 
       
   637     except CommandError, e :
       
   638         print e
       
   639         sys.exit(1)
       
   640