qmsk/irclogs/log_parser.py
changeset 140 6db2527b67cf
parent 110 37e67ec434f3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qmsk/irclogs/log_parser.py	Sun Sep 13 01:15:56 2009 +0300
@@ -0,0 +1,233 @@
+"""
+    Parse log data into log_events
+"""
+
+import re
+import datetime
+
+from log_line import LogTypes, LogLine
+
+class LogParseError (Exception) :
+    """
+        Raised when a log line cannot be parsed; the message reads "<repr of line>@<offset>: <message>".
+    """
+    def __init__ (self, line, offset, message) :
+        description = "%r@%s: %s" % (line, offset, message)
+        super(LogParseError, self).__init__(description)
+
+class LogParser (object) :
+    """
+        Abstract interface for log parsers: stores the timestamp settings, subclasses implement parse_lines.
+    """
+
+    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
+        """
+            Setup the parser to use the given format for line timestamps, which are of the given timezone.
+        """
+
+        self.tz = tz
+        self.timestamp_fmt = timestamp_fmt
+
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
+        """
+            Parse the given (iterable) lines of unicode text, without trailing newlines, into log events.
+
+            channel is the LogChannel that the lines belong to; starting_offset is the offset of the first
+            line, or None to not use offsets. Giving date lets the parser build full timestamps; otherwise,
+            unless line timestamps have full date information, event timestamps get a date of 1900/1/1.
+
+            Must be overridden; this base implementation raises NotImplementedError.
+        """
+
+        # an explicit error, instead of the NameError from evaluating an undefined bare name
+        raise NotImplementedError("%s.parse_lines" % self.__class__.__name__)
+
+class IrssiParser (LogParser) :
+    """
+        A parser for irssi logfiles: the first TYPE_REGEXES entry that matches a line decides its type.
+    """
+
+    # timestamp prefix, with trailing space
+    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'
+
+    # subexpression parts
+    _NICK = r'(?P<nickname>.+?)'
+    _NICK2 = r'(?P<nickname2>.+?)'
+    _TARGET = r'(?P<target>.+?)'
+    _CHAN = r'(?P<channel>.+?)'
+    _CHAN2 = r'(?P<channel2>.+?)'
+    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
+    _MSG = r'(?P<message>.*)'
+    _SRV1 = r'(?P<server1>.+?)'
+    _SRV2 = r'(?P<server2>.+?)'
+
+    # regular expressions for matching lines, by type; the first match wins, so the order matters
+    TYPE_EXPRS = (
+        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
+        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
+        (   LogTypes.MSG,           _TS + r'<(?P<flags>.)' + _NICK + r'> ' + _MSG                   ),
+        (   LogTypes.NOTICE,        _TS + r'-' + _NICK + r':' + _CHAN + r'- ' + _MSG                ),
+        (   LogTypes.ACTION,        _TS + r'\* ' + _NICK + r' ' + _MSG                              ),
+        (   LogTypes.JOIN,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN                              ),
+        (   LogTypes.PART,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'     ),
+        (   LogTypes.KICK,          _TS + r'-!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]' ),
+        # XXX: use hostname instead of nickname for ServerMode
+        (   LogTypes.MODE,          _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                ),
+        (   LogTypes.NICK,          _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)'                                         ),
+        (   LogTypes.QUIT,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'                    ),
+        (   LogTypes.TOPIC,         _TS + r'-!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'    ),
+
+        (   LogTypes.SELF_NOTICE,   _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG                   ),
+        (   LogTypes.SELF_NICK,     _TS + r'-!- You\'re now known as (?P<target>\S+)'              ),
+
+        (   LogTypes.NETSPLIT_START,    _TS + r'-!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
+        (   LogTypes.NETSPLIT_END,      _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'              ),
+
+        # not a LogTypes value: gives the date for the lines that follow, and produces no LogLine
+        (   'DAY_CHANGED',          r'--- Day changed (?P<date>.+)'                                 ),
+    )
+
+    # precompile
+    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]
+
+    def _parse_timestamp (self, groups, date, line, offset) :
+        """
+            Return the naive datetime of a matched line, or raise LogParseError if it has no timestamp group.
+        """
+        if 'datetime' in groups :
+            # full date and time, in the default asctime() format
+            return datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')
+
+        if 'timestamp' in groups :
+            # line timestamp, in the configured format
+            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
+
+            # override date?
+            if date :
+                dt = dt.replace(year=date.year, month=date.month, day=date.day)
+
+            return dt
+
+        if 'date' in groups :
+            # date-only datetime
+            return datetime.datetime.strptime(groups['date'], '%a %b %d %Y')
+
+        # no timestamp !?
+        raise LogParseError(line, offset, "No timestamp")
+
+    def _parse_data (self, groups) :
+        """
+            Return the data of a matched line: its message, mode, topic or netsplit nick list, else None.
+        """
+        # the first of these groups that the matched expression defines is used as-is
+        for name in ('message', 'mode', 'topic') :
+            if name in groups :
+                return groups[name]
+
+        if 'nick_list' not in groups :
+            return None
+
+        # split into components
+        nicks = groups['nick_list'].split(', ')
+
+        # additional count?
+        if groups.get('count') :
+            nicks.append('+%d' % int(groups['count']))
+
+        return ' '.join(nicks)
+
+    def _parse_line_info (self, channel, line, date, offset=None) :
+        """
+            Parse a single line into a (date, LogLine) tuple, or return None for an empty line.
+
+            The returned date is the one to use for the following lines; a "Day changed" line returns the new
+            date with None in place of the LogLine. Raises LogParseError if the line does not match any type.
+        """
+        # empty line
+        if not line :
+            return None
+
+        # test each type, the first expression to match decides it
+        for line_type, regex in self.TYPE_REGEXES :
+            match = regex.match(line)
+
+            if match :
+                break
+
+        else :
+            # no match found
+            raise LogParseError(line, offset, "Line did not match any type")
+
+        # match groups; named groups that did not take part in the match are None
+        groups = match.groupdict(None)
+
+        # parse the timestamp, and localize it with our timezone
+        dtz = self.tz.localize(self._parse_timestamp(groups, date, line, offset))
+
+        # custom types?
+        if line_type == 'DAY_CHANGED' :
+            # no event of its own, but the lines after it belong to the new date
+            return dtz, None
+
+        # source
+        if 'server1' in groups :
+            source = (None, None, groups.get('server1'), None)
+
+        else :
+            nickname = groups.get('nickname') or groups.get('nickname2')
+            source = (nickname, groups.get('username'), groups.get('hostname'), groups.get('flags'))
+
+        # target
+        target = groups.get('server2') if 'server2' in groups else groups.get('target')
+
+        # build+return (date, LogLine)
+        return date, LogLine(channel, offset, line_type, dtz, source, target, self._parse_data(groups))
+
+    def parse_line (self, channel, line, date, offset=None) :
+        """
+            Parse a single line, and return (date, LogLine), or None to ignore the line.
+
+            Empty lines and "Day changed" lines are ignored. Raises LogParseError for unmatched lines.
+        """
+        line_info = self._parse_line_info(channel, line, date, offset)
+
+        # a date change has no LogLine to return
+        if line_info is None or line_info[1] is None :
+            return None
+
+        return line_info
+
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
+        """
+            Parse the given lines, yielding LogLines; a "Day changed" line sets the date of the lines after it.
+
+            Offsets count up from starting_offset (which may be zero), or are None if starting_offset is None.
+            Errors other than LogParseError are wrapped as LogParseError.
+        """
+        for index, line in enumerate(lines) :
+            # offset?
+            offset = None if starting_offset is None else starting_offset + index
+
+            # try and parse, getting None or (date, LogLine or None)
+            try :
+                line_info = self._parse_line_info(channel, line, date, offset)
+
+            # passthrough LogParseError's
+            except LogParseError :
+                raise
+
+            # wrap other errors as LogParseError
+            except Exception as e :
+                raise LogParseError(line, offset, "Parsing line failed: %s" % e)
+
+            # nothing?
+            if line_info is None :
+                continue
+
+            # unpack, update date
+            date, log_line = line_info
+
+            if log_line is not None :
+                yield log_line
+
+