log_source.py
changeset 82 afd3120ec71e
parent 81 745032a57803
child 83 a34e9f56ddda
--- a/log_source.py	Tue Feb 10 03:48:51 2009 +0200
+++ b/log_source.py	Tue Feb 10 04:27:22 2009 +0200
@@ -6,11 +6,53 @@
 import os, errno
 import pytz
 
+import config
+
+class LogSourceDecoder (object) :
+    """
+        Decodes raw LogSource lines into unicode, falling back through a list of candidate encodings
+    """
+
+    def __init__ (self, encoding_list) :
+        """
+            encoding_list is a list of (charset, errors) items; decode() tries them in order until one succeeds
+        """
+
+        self.encoding_list = encoding_list
+    
+    def decode (self, line) :
+        """
+            Decode the line of str() text into an unicode object, using the first (charset, errors) item that works.
+            Raises UnicodeDecodeError, with every attempt's error as the reason, if none of them succeed.
+        """
+        
+        # error messages from the failed attempts, reported if nothing succeeds
+        error_list = []
+        
+        for charset, errors in self.encoding_list :
+            # trap UnicodeDecodeError to try with the next one
+            try :
+                return line.decode(charset, errors)
+
+            except UnicodeDecodeError, e :
+                error_list.append("%s:%s - %s" % (charset, errors, e))
+
+        # UnicodeDecodeError requires (encoding, object, start, end, reason); a lone message raises TypeError
+        reason = "Failed to decode line: %r: %s" % (line, ', '.join(error_list))
+        raise UnicodeDecodeError(', '.join([c for c, _ in self.encoding_list]), line, 0, len(line), reason)
+
 class LogSource (object) :
     """
         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
     """
     
+    def __init__ (self, decoder) :
+        """
+            Use the given LogSourceDecoder, which is stored as self.decoder
+        """
+
+        self.decoder = decoder
+    
     def get_latest (self, count) :
         """
             Yield the latest events, up to `count` of them.
@@ -106,7 +148,7 @@
         """
 
         abstract
- 
+
 class LogFile (object) :
     """
         A file containing LogEvents
@@ -114,18 +156,20 @@
         XXX: modify to implement LogSource?
     """
 
-    def __init__ (self, path, parser, charset, start_date=None, sep='\n') :
+    def __init__ (self, path, parser, decoder, start_date=None, sep='\n') :
         """
-            Open the file at the given path, which contains data with the given charset, as lines separated by the
-            given separator. Lines are parsed using the given parser, using the given date as an initial date, see
-            LogParser for more info. XXX: currently we assume start_date also for the end of the file
+            Open the file at the given path, which contains lines separated by the given separator. Lines are
+            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
+            as the initial date for this log's first line.
+            
+            XXX: currently we assume start_date also for the end of the file
         """
         
         # store
         self.path = path
         self.parser = parser
         self.start_date = start_date
-        self.charset = charset
+        self.decoder = decoder
         self.sep = sep
 
         # open
@@ -140,7 +184,7 @@
         self.file.seek(0)
 
         # iterate over lines, decoding them as well
-        return (line.decode(self.charset).rstrip(self.sep) for line in self.file)
+        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
     
     def read_full (self) :
         """
@@ -279,19 +323,19 @@
         A directory containing a series of timestamped LogFiles
     """
 
-    def __init__ (self, path, tz, parser, charset, filename_fmt) :
+    def __init__ (self, path, tz, parser, decoder, filename_fmt) :
         """
             Load the logfiles at the given path.
             
-            The files contain data in the given charset, and are named according the the date in the given timezone and
-            date format, and will be parsed using the given parser.
+            Decode the file lines using the given decoder. The files are named according to the date in the given
+            timezone and date format, and will be parsed using the given parser.
         """
 
         # store
         self.path = path
         self.tz = tz
         self.parser = parser
-        self.charset = charset
+        self.decoder = decoder
         self.filename_fmt = filename_fmt
 
     def _get_logfile_datetime (self, dt) :
@@ -322,7 +366,7 @@
         try :
             if load :
                 # open+return the LogFile
-                return LogFile(path, self.parser, self.charset, d)
+                return LogFile(path, self.parser, self.decoder, d)
             
             else :
                 # test