add a LogSourceDecoder to fallback from utf-8 to latin-1, and improve scripts/search-index.py
authorTero Marttila <terom@fixme.fi>
Tue, 10 Feb 2009 04:27:22 +0200
changeset 82 afd3120ec71e
parent 81 745032a57803
child 83 a34e9f56ddda
add a LogSourceDecoder to fallback from utf-8 to latin-1, and improve scripts/search-index.py
config.py
handlers.py
log_source.py
scripts/search-index.py
tools/search.py
--- a/config.py	Tue Feb 10 03:48:51 2009 +0200
+++ b/config.py	Tue Feb 10 04:27:22 2009 +0200
@@ -5,7 +5,7 @@
 import os.path, pytz
 from log_parser import IrssiParser
 from log_channel import LogChannel
-from log_source import LogDirectory
+from log_source import LogSourceDecoder, LogDirectory
 from log_formatter import IrssiFormatter
 from channels import ChannelList
 import log_formatter
@@ -23,8 +23,11 @@
 # timestamp format for logfiles
 LOG_TIMESTAMP_FMT               = '%H:%M:%S'
 
-# character set used for logfiles
-LOG_CHARSET                     = 'utf-8'
+# the decoder used for logfiles
+LOG_DECODER                     = LogSourceDecoder((
+    ('utf-8',       'strict'),
+    ('latin-1',     'replace'),
+))
 
 # log filename format
 LOG_FILENAME_FMT                = '%Y-%m-%d'
@@ -36,18 +39,21 @@
 # the statically defined channel list
 LOG_CHANNELS                    = ChannelList([
     LogChannel('tycoon',    "OFTC",     "#tycoon", 
-        LogDirectory(relpath('logs/tycoon'),    LOG_TIMEZONE, LOG_PARSER, LOG_CHARSET, LOG_FILENAME_FMT)
+        LogDirectory(relpath('logs/tycoon'),    LOG_TIMEZONE, LOG_PARSER, LOG_DECODER, LOG_FILENAME_FMT)
     ),
 
     LogChannel('openttd',   "OFTC",     "#openttd", 
-        LogDirectory(relpath('logs/openttd'),   LOG_TIMEZONE, LOG_PARSER, LOG_CHARSET, LOG_FILENAME_FMT)
+        LogDirectory(relpath('logs/openttd'),   LOG_TIMEZONE, LOG_PARSER, LOG_DECODER, LOG_FILENAME_FMT)
     ),
 
     LogChannel('test',      "TEST",     "#test",
-        LogDirectory(relpath('/home/terom/irclogs/test'),  LOG_TIMEZONE, LOG_PARSER_FULLTS, LOG_CHARSET, LOG_FILENAME_FMT)
+        LogDirectory(relpath('/home/terom/irclogs/test'),  LOG_TIMEZONE, LOG_PARSER_FULLTS, LOG_DECODER, LOG_FILENAME_FMT)
     )
 ])
 
+# how to handle decode() errors for logfile lines
+LOG_SOURCE_DECODE_ERRORS        = 'replace'
+
 # date format for URLs
 URL_DATE_FMT                    = '%Y-%m-%d'
 
@@ -81,10 +87,13 @@
 PREF_IMAGE_FONT_SIZE_MAX        = 32
 
 # search line count options
-SEARCH_LINE_COUNT_OPTIONS =     (
+SEARCH_LINE_COUNT_OPTIONS       = (
     (50,    50), 
     (100,   100), 
     (200,   200), 
     (None,  "&#8734;"),
 )
 
+# search index database path
+SEARCH_INDEX_PATH               = 'logs/index'
+
--- a/handlers.py	Tue Feb 10 03:48:51 2009 +0200
+++ b/handlers.py	Tue Feb 10 04:27:22 2009 +0200
@@ -22,7 +22,7 @@
 # our LogSearch thing
 # XXX: move elsewhere
 import log_search
-search_index = log_search.LogSearchIndex("logs/index", 'r')
+search_index = log_search.LogSearchIndex(config.SEARCH_INDEX_PATH, 'r')
 
 def index (request) :
     """
--- a/log_source.py	Tue Feb 10 03:48:51 2009 +0200
+++ b/log_source.py	Tue Feb 10 04:27:22 2009 +0200
@@ -6,11 +6,53 @@
 import os, errno
 import pytz
 
+import config
+
+class LogSourceDecoder (object) :
+    """
+        Handles decoding of LogSource lines
+    """
+
+    def __init__ (self, encoding_list) :
+        """
+            Will try each of the given (charset, errors) items in turn, until one succeeds
+        """
+
+        self.encoding_list = encoding_list
+    
+    def decode (self, line) :
+        """
+            Decode the line of str() text into a unicode object
+        """
+        
+        # list of errors encountered
+        error_list = []
+        
+        # try each in turn
+        for charset, errors in self.encoding_list :
+            # trap UnicodeDecodeError to try with the next one
+            try :
+                return line.decode(charset, errors)
+
+            except UnicodeDecodeError, e :
+                error_list.append("%s:%s - %s" % (charset, errors, e))
+                continue
+
+        # failure
+        raise UnicodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
+
 class LogSource (object) :
     """
         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
     """
     
+    def __init__ (self, decoder) :
+        """
+            Use the given LogSourceDecoder
+        """
+
+        self.decoder = decoder
+    
     def get_latest (self, count) :
         """
             Yield the latest events, up to `count` of them.
@@ -106,7 +148,7 @@
         """
 
         abstract
- 
+
 class LogFile (object) :
     """
         A file containing LogEvents
@@ -114,18 +156,20 @@
         XXX: modify to implement LogSource?
     """
 
-    def __init__ (self, path, parser, charset, start_date=None, sep='\n') :
+    def __init__ (self, path, parser, decoder, start_date=None, sep='\n') :
         """
-            Open the file at the given path, which contains data with the given charset, as lines separated by the
-            given separator. Lines are parsed using the given parser, using the given date as an initial date, see
-            LogParser for more info. XXX: currently we assume start_date also for the end of the file
+            Open the file at the given path, which contains lines as separated by the given separator. Lines are
+            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
+            as the initial date for this log's first line.
+            
+            XXX: currently we assume start_date also for the end of the file
         """
         
         # store
         self.path = path
         self.parser = parser
         self.start_date = start_date
-        self.charset = charset
+        self.decoder = decoder
         self.sep = sep
 
         # open
@@ -140,7 +184,7 @@
         self.file.seek(0)
 
         # iterate over lines, decoding them as well
-        return (line.decode(self.charset).rstrip(self.sep) for line in self.file)
+        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
     
     def read_full (self) :
         """
@@ -279,19 +323,19 @@
         A directory containing a series of timestamped LogFiles
     """
 
-    def __init__ (self, path, tz, parser, charset, filename_fmt) :
+    def __init__ (self, path, tz, parser, decoder, filename_fmt) :
         """
             Load the logfiles at the given path.
             
-            The files contain data in the given charset, and are named according the the date in the given timezone and
-            date format, and will be parsed using the given parser.
+            Decode the file lines using the given decoder, the files are named according to the date in the given
+            timezone and date format, and will be parsed using the given parser.
         """
 
         # store
         self.path = path
         self.tz = tz
         self.parser = parser
-        self.charset = charset
+        self.decoder = decoder
         self.filename_fmt = filename_fmt
 
     def _get_logfile_datetime (self, dt) :
@@ -322,7 +366,7 @@
         try :
             if load :
                 # open+return the LogFile
-                return LogFile(path, self.parser, self.charset, d)
+                return LogFile(path, self.parser, self.decoder, d)
             
             else :
                 # test
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/search-index.py	Tue Feb 10 04:27:22 2009 +0200
@@ -0,0 +1,169 @@
+"""
+    Tool for accessing the search index
+"""
+
+# XXX: fix path
+import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
+
+import datetime, pytz
+
+# configuration and the LogSearchIndex module
+import config, log_search, channels
+
+def _open_index_and_channel (options, channel_name, open_mode) :
+    """
+        Opens+returns a LogSearchIndex and a LogChannel
+    """
+    # open the LogSearchIndex
+    index = log_search.LogSearchIndex(options.index_path, open_mode)
+
+    # open the channel
+    channel = config.LOG_CHANNELS.lookup(channel_name)
+    
+    # return
+    return index, channel
+
+def _load_channel_date (index, options, channel, date) :
+    """
+        Loads the logs for the given date from the channel's LogSource into the given LogSearchIndex
+    """
+
+    if not options.quiet :
+        print "%s %s..." % (channel.id, date.strftime(channel.source.filename_fmt)),
+        
+    try :
+        # load lines for date
+        lines = channel.source.get_date(date)
+    
+    except Exception, e :
+        if not options.skip_missing :
+            raise
+            
+        if not options.quiet :
+            print "Skipped: %s" % (e, )
+    
+    else :
+        # insert -> count
+        count = index.insert(channel, lines)
+
+        if not options.quiet :
+            print "OK: %d lines" % count
+
+def cmd_load (options, channel_name, *dates) :
+    """
+        Loads the logs for a specific channel for the given dates (in terms of the channel logs' timezone) into the index
+    """
+
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, '*' if options.create_index else 'a')
+    
+    # handle each date
+    for date_name in dates :
+        try :
+            # parse date
+            date = datetime.datetime.strptime(date_name, '%Y-%m-%d').replace(tzinfo=channel.source.tz)
+
+        except Exception, e :
+            print "[ERROR] Invalid date: %s: %s" % (date_name, e)
+
+            if options.skip_missing :
+                continue
+
+            else :
+                raise
+        
+        # load
+        _load_channel_date(index, options, channel, date)
+
+def cmd_load_month (options, channel_name, *months) :
+    """
+        Loads the logs for a specific channel for the given months (in terms of the channel's timezone) into the index
+    """
+
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, '*' if options.create_index else 'a')
+    
+    # handle each date
+    for month_name in months :
+        try :
+            # parse date
+            month = datetime.datetime.strptime(month_name, '%Y-%m').replace(tzinfo=channel.source.tz)
+
+        except Exception, e :
+            print "[ERROR] Invalid date: %s: %s" % (month_name, e)
+
+            if options.skip_missing :
+                continue
+
+            else :
+                raise
+        
+        # get the set of days
+        days = channel.source.get_month_days(month)
+
+        print "Loading %d days of logs:" % (len(days))
+
+        # load each day
+        for date in days :
+            # convert to datetime
+            dt = datetime.datetime.combine(date, datetime.time(0)).replace(tzinfo=channel.source.tz)
+            
+            # load
+            _load_channel_date(index, options, channel, dt)
+
+def cmd_search (options, channel_name, query) :
+    """
+        Search the index for events on a specific channel with the given query
+    """
+    
+    # sanity-check
+    if options.create_index :
+        raise Exception("--create doesn't make sense for 'search'")
+    
+    # open index/channel
+    index, channel = _open_index_and_channel(options, channel_name, 'r')
+    
+    # search
+    lines = index.search_simple(channel, query)
+    
+    # display as plaintext
+    for line in options.formatter.format_txt(lines) :
+        print line
+
+if __name__ == '__main__' :
+    from optparse import OptionParser
+    
+    # define parser
+    parser = OptionParser(
+        usage           = "%prog [options] <command> [ ... ]",
+        add_help_option = True,
+    )
+
+    # define command-line arguments
+    parser.add_option("-I", "--index", dest="index_path", help="Index database path", metavar="PATH", default="logs/index")
+    parser.add_option("--create", dest="create_index", action="store_true", help="Create index database")
+    parser.add_option("-f", "--formatter", dest="formatter_name", help="LogFormatter to use", default="irssi")
+    parser.add_option("-z", "--timezone", dest="tz_name", help="Timezone for output", metavar="TZ", default="UTC")
+    parser.add_option("--skip-missing", dest="skip_missing", action="store_true", help="Skip missing logfiles")
+    parser.add_option("--quiet", dest="quiet", action="store_true", help="Supress status messages")
+
+    # parse
+    options, args = parser.parse_args()
+
+    # postprocess stuff
+    options.tz = pytz.timezone(options.tz_name)
+    options.formatter = config.LOG_FORMATTERS[options.formatter_name](options.tz, "%H:%M:%S", None, None)
+    
+    # pop command
+    if not args :
+        raise Exception("Missing command")
+
+    command = args.pop(0)
+
+    # inspect
+    func = globals()['cmd_%s' % command]
+    
+    # call
+    func(options, *args)
+
+
--- a/tools/search.py	Tue Feb 10 03:48:51 2009 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-"""
-    Tool for accessing the search index
-"""
-
-import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
-
-import datetime, pytz
-
-import log_search
-
-def cmd_load (options, channel_name, *dates) :
-    """
-        Loads the logs for a specific channel for the given dates into the index
-    """
-
-    import channels
-    
-    # open the LogSearchIndex
-    index = log_search.LogSearchIndex(options.index_path, '*' if options.create_index else 'a')
-
-    # open the channel
-    channel = channels.channel_list.lookup(channel_name)
-    
-    for date_name in dates :
-        print "%s..." % (date_name, ),
-        lines = None
-        
-        try :
-            # parse date
-            date = datetime.datetime.strptime(date_name, '%Y-%m-%d').replace(tzinfo=channel.source.tz)
-
-            # load lines for date
-            lines = channel.source.get_date(date)
-        
-        except Exception, e :
-            print "Skipped: %s" % (e, )
-        
-        else :
-            # insert
-            count = index.insert(channel, lines)
-
-            print "%d" % count
-
-def cmd_search (options, channel_name, query) :
-    """
-        Search the index for events on a specific channel with the given query
-    """
-
-    import channels
-
-    assert not options.create_index
-    
-    # open the LogSearchIndex
-    index = log_search.LogSearchIndex(options.index_path, 'r')
-
-    # open the channel
-    channel = channels.channel_list.lookup(channel_name)
-    
-    # search
-    lines = index.search_simple(channel, query)
-    
-    # display as plaintext
-    for line in options.formatter.format_txt(lines) :
-        print line
-
-if __name__ == '__main__' :
-    from optparse import OptionParser
-    import log_formatter
-    
-    # define parser
-    parser = OptionParser(
-        usage           = "%prog [options] <command> [ ... ]",
-        add_help_option = True,
-    )
-
-    # define command-line arguments
-    parser.add_option("-I", "--index", dest="index_path", help="Index database path", metavar="PATH", default="logs/index")
-    parser.add_option("--create", dest="create_index", action="store_true", help="Create index database")
-    parser.add_option("-f", "--formatter", dest="formatter_name", help="LogFormatter to use", default="irssi")
-    parser.add_option("-z", "--timezone", dest="tz_name", help="Timezone for output", metavar="TZ", default="UTC")
-    parser.add_option("--skip-missing", dest="skip_missing", action="store_true", help="Skip missing logfiles")
-
-    # parse
-    options, args = parser.parse_args()
-
-    # postprocess stuff
-    options.tz = pytz.timezone(options.tz_name)
-    options.formatter = log_formatter.by_name(options.formatter_name)(options.tz)
-    
-    # pop command
-    command = args.pop(0)
-
-    # inspect
-    func = globals()['cmd_%s' % command]
-    
-    # call
-    func(options, *args)
-
-