class LogSourceDecoder(object):
    """
        Handles decoding of LogSource lines.

        Each raw line is tried against an ordered list of (charset, errors) pairs; the result of the first
        pair that decodes the line without raising UnicodeDecodeError is used.
    """

    def __init__(self, encoding_list):
        """
            Will try each of the given (charset, errors) items in turn, until one succeeds.

            `encoding_list` is an iterable of (charset, errors) pairs; both values are passed unchanged to the
            line's .decode() method.
        """

        self.encoding_list = encoding_list

    def decode(self, line):
        """
            Decode the given encoded (byte string) line into an unicode object.

            Returns the output of the first (charset, errors) pair that decodes the line successfully.

            Raises UnicodeDecodeError if every pair fails (or if no pairs were given). The error's `encoding`
            lists the charsets that were tried, `object` is the original line, and `reason` summarizes each
            failed attempt.
        """

        # charsets attempted so far, and a description of why each one failed
        tried = []
        error_list = []

        # try each in turn
        for charset, errors in self.encoding_list:
            # trap UnicodeDecodeError to try with the next one
            try:
                return line.decode(charset, errors)

            except UnicodeDecodeError as e:
                tried.append(str(charset))
                error_list.append("%s:%s - %s" % (charset, errors, e))

        # failure
        # UnicodeDecodeError takes exactly (encoding, object, start, end, reason); building it from a single
        # message string would raise TypeError and hide the real decoding problem from callers.
        raise UnicodeDecodeError(
            ','.join(tried) or 'none',
            line,
            0,
            len(line),
            "Failed to decode line: %r: %s" % (line, ', '.join(error_list))
        )
XXX: currently we assume start_date also for the end of the file + Open the file at the given path, which contains lines as separated by the given separator. Lines are + decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date + as the initial date for this log's first line. + + XXX: currently we assume start_date also for the end of the file """ # store self.path = path self.parser = parser self.start_date = start_date - self.charset = charset + self.decoder = decoder self.sep = sep # open @@ -140,7 +184,7 @@ self.file.seek(0) # iterate over lines, decoding them as well - return (line.decode(self.charset).rstrip(self.sep) for line in self.file) + return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file) def read_full (self) : """ @@ -279,19 +323,19 @@ A directory containing a series of timestamped LogFiles """ - def __init__ (self, path, tz, parser, charset, filename_fmt) : + def __init__ (self, path, tz, parser, decoder, filename_fmt) : """ Load the logfiles at the given path. - The files contain data in the given charset, and are named according the the date in the given timezone and - date format, and will be parsed using the given parser. + Decode the file lines using the given decoder, the files are named according the the date in the given + timezone and date format, and will be parsed using the given parser. """ # store self.path = path self.tz = tz self.parser = parser - self.charset = charset + self.decoder = decoder self.filename_fmt = filename_fmt def _get_logfile_datetime (self, dt) : @@ -322,7 +366,7 @@ try : if load : # open+return the LogFile - return LogFile(path, self.parser, self.charset, d) + return LogFile(path, self.parser, self.decoder, d) else : # test