log_parser.py
changeset 86 645cf9c4441e
parent 83 a34e9f56ddda
child 92 74f6a0b01ddf
--- a/log_parser.py	Tue Feb 10 22:59:52 2009 +0200
+++ b/log_parser.py	Tue Feb 10 23:00:11 2009 +0200
@@ -2,10 +2,18 @@
     Parse log data into log_events
 """
 
+import re
 import datetime
 
-import log_line
-from log_line import LogTypes
+from log_line import LogTypes, LogLine
+
+class LogParseError (Exception) :
+    """
+        Raised when a log line cannot be parsed; the message reads "<repr of line>@<offset>: <message>"
+    """
+    def __init__ (self, offset, line, message) :
+        text = "%r@%s: %s" % (line, offset, message)
+        super(LogParseError, self).__init__(text)
 
 class LogParser (object) :
     """
@@ -20,10 +28,12 @@
         self.tz = tz
         self.timestamp_fmt = timestamp_fmt
 
-    def parse_lines (self, lines, date=None, starting_offset=None) :
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
         """
             Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.
 
+            Channel is the LogChannel that these lines belong to.
+
             Offset is the starting offset, and may be None to not use it.
             
             Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
@@ -32,13 +42,114 @@
 
         abstract
 
-
 class IrssiParser (LogParser) :
     """
         A parser for irssi logfiles
     """
 
-    def parse_lines (self, lines, date=None, starting_offset=None) :
+    # subexpression parts; the lazy groups (.+? / .*?) stop at whatever literal text follows them
+    _TS = r'(?P<timestamp>\S+)'
+    _NICK = r'(?P<nickname>.+?)'
+    _NICK2 = r'(?P<nickname2>.+?)'
+    _CHAN = r'(?P<channel>.+?)'
+    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
+    _MSG = r'(?P<message>.*)'
+    # regular expressions for matching lines, by type. They are used with re.match, which anchors only the
+    # start of the line, so a lazy group that ends an expression (JOIN's channel) needs an explicit '$'
+    TYPE_EXPRS = (
+        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
+        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
+        (   LogTypes.MSG,           _TS + r' <(?P<flags>.)' + _NICK + '> ' + _MSG                   ),
+        (   LogTypes.NOTICE,        _TS + r' -' + _NICK + ':' + _CHAN + '- ' + _MSG                 ),
+        (   LogTypes.ACTION,        _TS + r'  \* ' + _NICK + ' ' + _MSG                             ),
+        (   LogTypes.JOIN,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN + '$'                       ),
+        (   LogTypes.PART,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'    ),
+        (   LogTypes.KICK,          _TS + r' -!- ' + _NICK2 + ' was kicked from ' + _CHAN + ' by ' + _NICK + r' \[(?P<message>.*?)\]'   ),
+        (   LogTypes.MODE,          _TS + r' -!- mode/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                            ),
+        (   LogTypes.NICK,          _TS + r' -!- ' + _NICK + r' is now known as (?P<nickname2>\S+)'                                     ),
+        (   LogTypes.QUIT,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'                   ),
+        (   LogTypes.TOPIC,         _TS + r' -!- ' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P<topic>.*)'                    ),
+
+        (   LogTypes.SELF_NOTICE,   _TS + r' \[notice\(' + _CHAN + r'\)\] ' + _MSG                  ),
+        (   LogTypes.SELF_NICK,     _TS + r' -!- You\'re now known as (?P<nickname2>\S+)'           ),
+    )
+
+    # precompile
+    TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS]
+
+    def parse_line (self, channel, line, date, offset=None) :
+        """
+            Turn one line of log text into a LogLine; an empty line gives None, meaning "ignore this line".
+
+            The line is tested against each entry of self.TYPE_REGEXES in turn, and the first match wins.
+
+            channel and offset are handed to the LogLine constructor as-is; date, when given, supplies the
+            year, month and day for lines that only carry a per-line timestamp.
+
+            Raises LogParseError if no entry matches, or if the matching entry defines no timestamp group.
+        """
+
+        # nothing to parse
+        if not line :
+            return None
+
+        # try every known line type, stop at the first one that fits
+        for line_type, regex in self.TYPE_REGEXES :
+            match = regex.match(line)
+
+            if match :
+                break
+
+        else :
+            # ran through every expression without a hit
+            raise LogParseError(offset, line, "Line did not match any type")
+
+        # named groups of the matched expression, keyed by group name
+        groups = match.groupdict()
+
+        # work out the naive datetime
+        if 'datetime' in groups :
+            # full date and time, in the default asctime() format
+            naive = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')
+
+        elif 'timestamp' in groups :
+            # per-line timestamp, in the configured format
+            naive = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
+
+            # a given date replaces the year/month/day that strptime produced
+            if date :
+                naive = naive.replace(year=date.year, month=date.month, day=date.day)
+
+        else :
+            # no timestamp !?
+            raise LogParseError(offset, line, "No timestamp")
+
+        # data is the first of message/mode/topic that the matched expression defines, else None
+        for key in ('message', 'mode', 'topic') :
+            if key in groups :
+                data = groups[key]
+                break
+
+        else :
+            data = None
+
+        # source is (nickname, username, hostname, flags), with None for groups the expression lacks
+        source_keys = ('nickname', 'username', 'hostname', 'flags')
+        source = tuple(groups.get(name) for name in source_keys)
+
+        # target is the second nickname, when the expression has one
+        target = groups.get('nickname2')
+
+        # attach the timezone to the naive datetime
+        dtz = self.tz.localize(naive)
+
+        # build+return LogLine
+        return LogLine(
+            channel, offset, line_type,
+            dtz, source, target, data
+        )
+
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
         """
             Parse the given lines, yielding LogEvents. 
         """
@@ -53,45 +164,19 @@
             
             # try and parse
             try :
-                line = self.parse_line(line, date, offset)
-
+                line = self.parse_line(channel, line, date, offset)
+            
+            # passthrough LogParseError's
+            except LogParseError :
+                raise
+            
+            # wrap other errors as LogParseError; its arguments are (offset, line, message), in that order
             except Exception, e :
-                raise Exception("Parsing line failed: %r@%d: %s" % (line, offset, e))
+                raise LogParseError(offset, line, "Parsing line failed: %s" % e)
             
             else :
                 # yield unless None
                 if line :
                     yield line
 
-    def parse_line (self, line, date, offset=None) :
-        """
-            Parse a single line, and return the resulting LogLine, or None, to ignore the line
-        """
-        
-        # empty line
-        if not line :
-            return
 
-        # status lines
-        elif line.startswith('---') :
-            # XXX: handle these
-            return
-        
-        # normal lines
-        else :
-            # XXX: only parse timestamps for now
-            timestamp, data = line.split(' ', 1)
-            
-            # parse timestamp into naive datetime
-            dt = datetime.datetime.strptime(timestamp, self.timestamp_fmt)
-            
-            # override date?
-            if date :
-                dt = dt.replace(year=date.year, month=date.month, day=date.day)
-            
-            # now localize with timezone
-            dtz = self.tz.localize(dt)
-
-            # build raw event
-            return log_line.LogLine(offset, LogTypes.RAW, dtz, None, data)
-