add a LogSourceDecoder to fall back from utf-8 to latin-1, and improve scripts/search-index.py
--- a/config.py Tue Feb 10 03:48:51 2009 +0200
+++ b/config.py Tue Feb 10 04:27:22 2009 +0200
@@ -5,7 +5,7 @@
import os.path, pytz
from log_parser import IrssiParser
from log_channel import LogChannel
-from log_source import LogDirectory
+from log_source import LogSourceDecoder, LogDirectory
from log_formatter import IrssiFormatter
from channels import ChannelList
import log_formatter
@@ -23,8 +23,11 @@
# timestamp format for logfiles
LOG_TIMESTAMP_FMT = '%H:%M:%S'
-# character set used for logfiles
-LOG_CHARSET = 'utf-8'
+# the decoder used for logfiles
+LOG_DECODER = LogSourceDecoder((
+ ('utf-8', 'strict'),
+ ('latin-1', 'replace'),
+))
# log filename format
LOG_FILENAME_FMT = '%Y-%m-%d'
@@ -36,18 +39,21 @@
# the statically defined channel list
LOG_CHANNELS = ChannelList([
LogChannel('tycoon', "OFTC", "#tycoon",
- LogDirectory(relpath('logs/tycoon'), LOG_TIMEZONE, LOG_PARSER, LOG_CHARSET, LOG_FILENAME_FMT)
+ LogDirectory(relpath('logs/tycoon'), LOG_TIMEZONE, LOG_PARSER, LOG_DECODER, LOG_FILENAME_FMT)
),
LogChannel('openttd', "OFTC", "#openttd",
- LogDirectory(relpath('logs/openttd'), LOG_TIMEZONE, LOG_PARSER, LOG_CHARSET, LOG_FILENAME_FMT)
+ LogDirectory(relpath('logs/openttd'), LOG_TIMEZONE, LOG_PARSER, LOG_DECODER, LOG_FILENAME_FMT)
),
LogChannel('test', "TEST", "#test",
- LogDirectory(relpath('/home/terom/irclogs/test'), LOG_TIMEZONE, LOG_PARSER_FULLTS, LOG_CHARSET, LOG_FILENAME_FMT)
+ LogDirectory(relpath('/home/terom/irclogs/test'), LOG_TIMEZONE, LOG_PARSER_FULLTS, LOG_DECODER, LOG_FILENAME_FMT)
)
])
+# how to handle decode() errors for logfile lines
+LOG_SOURCE_DECODE_ERRORS = 'replace'
+
# date format for URLs
URL_DATE_FMT = '%Y-%m-%d'
@@ -81,10 +87,13 @@
PREF_IMAGE_FONT_SIZE_MAX = 32
# search line count options
-SEARCH_LINE_COUNT_OPTIONS = (
+SEARCH_LINE_COUNT_OPTIONS = (
(50, 50),
(100, 100),
(200, 200),
(None, "∞"),
)
+# search index database path
+SEARCH_INDEX_PATH = 'logs/index'
+
--- a/handlers.py Tue Feb 10 03:48:51 2009 +0200
+++ b/handlers.py Tue Feb 10 04:27:22 2009 +0200
@@ -22,7 +22,7 @@
# our LogSearch thing
# XXX: move elsewhere
import log_search
-search_index = log_search.LogSearchIndex("logs/index", 'r')
+search_index = log_search.LogSearchIndex(config.SEARCH_INDEX_PATH, 'r')
def index (request) :
"""
--- a/log_source.py Tue Feb 10 03:48:51 2009 +0200
+++ b/log_source.py Tue Feb 10 04:27:22 2009 +0200
@@ -6,11 +6,53 @@
import os, errno
import pytz
+import config
+
+class LogSourceDecoder (object) :
+ """
+ Handles decoding of LogSource lines
+ """
+
+ def __init__ (self, encoding_list) :
+ """
+ Will try each of the given (charset, errors) items in turn, until one succeeds
+ """
+
+ self.encoding_list = encoding_list
+
+ def decode (self, line) :
+ """
+ Decode the line of str() text into an unicode object
+ """
+
+ # list of errors encountered
+ error_list = []
+
+ # try each in turn
+ for charset, errors in self.encoding_list :
+ # trap UnicodeDecodeError to try with the next one
+ try :
+ return line.decode(charset, errors)
+
+ except UnicodeDecodeError, e :
+ error_list.append("%s:%s - %s" % (charset, errors, e))
+ continue
+
+ # failure
+        raise UnicodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
+
class LogSource (object) :
"""
A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
"""
+ def __init__ (self, decoder) :
+ """
+ Use the given LogSourceDecoder
+ """
+
+ self.decoder = decoder
+
def get_latest (self, count) :
"""
Yield the latest events, up to `count` of them.
@@ -106,7 +148,7 @@
"""
abstract
-
+
class LogFile (object) :
"""
A file containing LogEvents
@@ -114,18 +156,20 @@
XXX: modify to implement LogSource?
"""
- def __init__ (self, path, parser, charset, start_date=None, sep='\n') :
+ def __init__ (self, path, parser, decoder, start_date=None, sep='\n') :
"""
- Open the file at the given path, which contains data with the given charset, as lines separated by the
- given separator. Lines are parsed using the given parser, using the given date as an initial date, see
- LogParser for more info. XXX: currently we assume start_date also for the end of the file
+ Open the file at the given path, which contains lines as separated by the given separator. Lines are
+ decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
+ as the initial date for this log's first line.
+
+ XXX: currently we assume start_date also for the end of the file
"""
# store
self.path = path
self.parser = parser
self.start_date = start_date
- self.charset = charset
+ self.decoder = decoder
self.sep = sep
# open
@@ -140,7 +184,7 @@
self.file.seek(0)
# iterate over lines, decoding them as well
- return (line.decode(self.charset).rstrip(self.sep) for line in self.file)
+ return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
def read_full (self) :
"""
@@ -279,19 +323,19 @@
A directory containing a series of timestamped LogFiles
"""
- def __init__ (self, path, tz, parser, charset, filename_fmt) :
+ def __init__ (self, path, tz, parser, decoder, filename_fmt) :
"""
Load the logfiles at the given path.
- The files contain data in the given charset, and are named according the the date in the given timezone and
- date format, and will be parsed using the given parser.
+        Decode the file lines using the given decoder, the files are named according to the date in the given
+ timezone and date format, and will be parsed using the given parser.
"""
# store
self.path = path
self.tz = tz
self.parser = parser
- self.charset = charset
+ self.decoder = decoder
self.filename_fmt = filename_fmt
def _get_logfile_datetime (self, dt) :
@@ -322,7 +366,7 @@
try :
if load :
# open+return the LogFile
- return LogFile(path, self.parser, self.charset, d)
+ return LogFile(path, self.parser, self.decoder, d)
else :
# test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/search-index.py Tue Feb 10 04:27:22 2009 +0200
@@ -0,0 +1,169 @@
+"""
+ Tool for accessing the search index
+"""
+
+# XXX: fix path
+import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
+
+import datetime, pytz
+
+# configuration and the LogSearchIndex module
+import config, log_search, channels
+
+def _open_index_and_channel (options, channel_name, open_mode) :
+ """
+ Opens+returns a LogSearchIndex and a LogChannel
+ """
+ # open the LogSearchIndex
+ index = log_search.LogSearchIndex(options.index_path, open_mode)
+
+ # open the channel
+ channel = config.LOG_CHANNELS.lookup(channel_name)
+
+ # return
+ return index, channel
+
+def _load_channel_date (index, options, channel, date) :
+ """
+ Loads the logs for the given date from the channel's LogSource into the given LogSearchIndex
+ """
+
+ if not options.quiet :
+ print "%s %s..." % (channel.id, date.strftime(channel.source.filename_fmt)),
+
+ try :
+ # load lines for date
+ lines = channel.source.get_date(date)
+
+ except Exception, e :
+ if not options.skip_missing :
+ raise
+
+ if not options.quiet :
+ print "Skipped: %s" % (e, )
+
+ else :
+ # insert -> count
+ count = index.insert(channel, lines)
+
+ if not options.quiet :
+ print "OK: %d lines" % count
+
+def cmd_load (options, channel_name, *dates) :
+ """
+        Loads the logs for a specific channel for the given dates (in terms of the channel logs' timezone) into the index
+ """
+
+ # open index/channel
+ index, channel = _open_index_and_channel(options, channel_name, '*' if options.create_index else 'a')
+
+ # handle each date
+ for date_name in dates :
+ try :
+ # parse date
+ date = datetime.datetime.strptime(date_name, '%Y-%m-%d').replace(tzinfo=channel.source.tz)
+
+ except Exception, e :
+ print "[ERROR] Invalid date: %s: %s" % (date_name, e)
+
+ if options.skip_missing :
+ continue
+
+ else :
+ raise
+
+ # load
+ _load_channel_date(index, options, channel, date)
+
+def cmd_load_month (options, channel_name, *months) :
+ """
+ Loads the logs for a specific channel for the given months (in terms of the channel's timezone) into the index
+ """
+
+ # open index/channel
+ index, channel = _open_index_and_channel(options, channel_name, '*' if options.create_index else 'a')
+
+ # handle each date
+ for month_name in months :
+ try :
+ # parse date
+ month = datetime.datetime.strptime(month_name, '%Y-%m').replace(tzinfo=channel.source.tz)
+
+ except Exception, e :
+ print "[ERROR] Invalid date: %s: %s" % (month_name, e)
+
+ if options.skip_missing :
+ continue
+
+ else :
+ raise
+
+ # get the set of days
+ days = channel.source.get_month_days(month)
+
+ print "Loading %d days of logs:" % (len(days))
+
+ # load each day
+ for date in days :
+ # convert to datetime
+ dt = datetime.datetime.combine(date, datetime.time(0)).replace(tzinfo=channel.source.tz)
+
+ # load
+ _load_channel_date(index, options, channel, dt)
+
+def cmd_search (options, channel_name, query) :
+ """
+ Search the index for events on a specific channel with the given query
+ """
+
+ # sanity-check
+ if options.create_index :
+ raise Exception("--create doesn't make sense for 'search'")
+
+ # open index/channel
+ index, channel = _open_index_and_channel(options, channel_name, 'r')
+
+ # search
+ lines = index.search_simple(channel, query)
+
+ # display as plaintext
+ for line in options.formatter.format_txt(lines) :
+ print line
+
+if __name__ == '__main__' :
+ from optparse import OptionParser
+
+ # define parser
+ parser = OptionParser(
+ usage = "%prog [options] <command> [ ... ]",
+ add_help_option = True,
+ )
+
+ # define command-line arguments
+    parser.add_option("-I", "--index", dest="index_path", help="Index database path", metavar="PATH", default=config.SEARCH_INDEX_PATH)
+ parser.add_option("--create", dest="create_index", action="store_true", help="Create index database")
+ parser.add_option("-f", "--formatter", dest="formatter_name", help="LogFormatter to use", default="irssi")
+ parser.add_option("-z", "--timezone", dest="tz_name", help="Timezone for output", metavar="TZ", default="UTC")
+ parser.add_option("--skip-missing", dest="skip_missing", action="store_true", help="Skip missing logfiles")
+    parser.add_option("--quiet", dest="quiet", action="store_true", help="Suppress status messages")
+
+ # parse
+ options, args = parser.parse_args()
+
+ # postprocess stuff
+ options.tz = pytz.timezone(options.tz_name)
+ options.formatter = config.LOG_FORMATTERS[options.formatter_name](options.tz, "%H:%M:%S", None, None)
+
+ # pop command
+ if not args :
+ raise Exception("Missing command")
+
+ command = args.pop(0)
+
+ # inspect
+ func = globals()['cmd_%s' % command]
+
+ # call
+ func(options, *args)
+
+
--- a/tools/search.py Tue Feb 10 03:48:51 2009 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-"""
- Tool for accessing the search index
-"""
-
-import sys; sys.path.insert(0, '.'); sys.path.insert(0, '..')
-
-import datetime, pytz
-
-import log_search
-
-def cmd_load (options, channel_name, *dates) :
- """
- Loads the logs for a specific channel for the given dates into the index
- """
-
- import channels
-
- # open the LogSearchIndex
- index = log_search.LogSearchIndex(options.index_path, '*' if options.create_index else 'a')
-
- # open the channel
- channel = channels.channel_list.lookup(channel_name)
-
- for date_name in dates :
- print "%s..." % (date_name, ),
- lines = None
-
- try :
- # parse date
- date = datetime.datetime.strptime(date_name, '%Y-%m-%d').replace(tzinfo=channel.source.tz)
-
- # load lines for date
- lines = channel.source.get_date(date)
-
- except Exception, e :
- print "Skipped: %s" % (e, )
-
- else :
- # insert
- count = index.insert(channel, lines)
-
- print "%d" % count
-
-def cmd_search (options, channel_name, query) :
- """
- Search the index for events on a specific channel with the given query
- """
-
- import channels
-
- assert not options.create_index
-
- # open the LogSearchIndex
- index = log_search.LogSearchIndex(options.index_path, 'r')
-
- # open the channel
- channel = channels.channel_list.lookup(channel_name)
-
- # search
- lines = index.search_simple(channel, query)
-
- # display as plaintext
- for line in options.formatter.format_txt(lines) :
- print line
-
-if __name__ == '__main__' :
- from optparse import OptionParser
- import log_formatter
-
- # define parser
- parser = OptionParser(
- usage = "%prog [options] <command> [ ... ]",
- add_help_option = True,
- )
-
- # define command-line arguments
- parser.add_option("-I", "--index", dest="index_path", help="Index database path", metavar="PATH", default="logs/index")
- parser.add_option("--create", dest="create_index", action="store_true", help="Create index database")
- parser.add_option("-f", "--formatter", dest="formatter_name", help="LogFormatter to use", default="irssi")
- parser.add_option("-z", "--timezone", dest="tz_name", help="Timezone for output", metavar="TZ", default="UTC")
- parser.add_option("--skip-missing", dest="skip_missing", action="store_true", help="Skip missing logfiles")
-
- # parse
- options, args = parser.parse_args()
-
- # postprocess stuff
- options.tz = pytz.timezone(options.tz_name)
- options.formatter = log_formatter.by_name(options.formatter_name)(options.tz)
-
- # pop command
- command = args.pop(0)
-
- # inspect
- func = globals()['cmd_%s' % command]
-
- # call
- func(options, *args)
-
-