# log_parser.py
# author: Tero Marttila <terom@fixme.fi>
# Thu, 12 Feb 2009 00:31:34 +0200
# changeset 116:81da986f6ed5 (parent 110:37e67ec434f3)
# fix wrong timezone for channel_date
"""
    Parse log data into log_events
"""

import re
import datetime

from log_line import LogTypes, LogLine

class LogParseError (Exception) :
    """
        Parsing some line failed
    """

    def __init__ (self, line, offset, message) :
        super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))

class LogParser (object) :
    """
        Abstract interface
    """

    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
        """
            Set up the parser to use the given strptime format for line timestamps, which are interpreted in the given timezone.
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given iterable of unicode lines (without trailing newlines) into LogLines.

            Channel is the LogChannel that these lines belong to.

            Starting_offset is the offset of the first line, and may be None to not track offsets.

            Giving a date lets the parser build full timestamps; otherwise, unless the line timestamps
            themselves carry full date information, event timestamps will have a date component of 1900/1/1.
        """

        raise NotImplementedError()
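
# For example (illustrative): with the default timestamp_fmt of "%H:%M:%S", a line timestamp such
# as "00:31:34" is parsed into a naive datetime, combined with the date given to parse_lines(), and
# then localized into self.tz, which is therefore expected to be a pytz-style timezone object
# providing .localize(); see the __main__ sketch at the end of this file for a full run.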

class IrssiParser (LogParser) :
    """
        A parser for irssi logfiles
    """
    
    # timestamp prefix, followed by any trailing whitespace
    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'

    # subexpression parts
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'
    _SRV1 = r'(?P<server1>.+?)'
    _SRV2 = r'(?P<server2>.+?)'

    # regular expressions for matching lines, by type
    TYPE_EXPRS = (
        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
        (   LogTypes.MSG,           _TS + r'<(?P<flags>.)' + _NICK + '> ' + _MSG                   ),
        (   LogTypes.NOTICE,        _TS + r'-' + _NICK + ':' + _CHAN + '- ' + _MSG                 ),
        (   LogTypes.ACTION,        _TS + r'\* ' + _NICK + ' ' + _MSG                             ),
        (   LogTypes.JOIN,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN                               ),
        (   LogTypes.PART,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'       ),
        (   LogTypes.KICK,          _TS + r'-!- ' + _TARGET + ' was kicked from ' + _CHAN + ' by ' + _NICK + r' \[(?P<message>.*?)\]'   ),
        # XXX: use hostname instead of nickname for ServerMode
        (   LogTypes.MODE,          _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                ),
        (   LogTypes.NICK,          _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)'                                         ),
        (   LogTypes.QUIT,          _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'                     ),
        (   LogTypes.TOPIC,         _TS + r'-!- (' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + ' on ' + _CHAN2 + ')'    ),

        (   LogTypes.SELF_NOTICE,   _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG                   ),
        (   LogTypes.SELF_NICK,     _TS + r'-!- You\'re now known as (?P<target>\S+)'              ),

        (   LogTypes.NETSPLIT_START,    _TS + r'-!- Netsplit ' + _SRV1 + ' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
        (   LogTypes.NETSPLIT_END,      _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'              ),

        (   'DAY_CHANGED',          r'--- Day changed (?P<date>.+)'                                 ),
    )
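
    # illustrative (made-up) samples of the line formats matched above, one per common type:
    #
    #   --- Log opened Thu Feb 12 00:31:34 2009
    #   00:31:40 <@terom> hello world
    #   00:31:45 * terom waves
    #   00:32:01 -!- somenick [user@example.com] has joined #channel
    #   --- Day changed Fri Feb 13 2009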

    # precompile
    TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS]

    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, returning a (date, LogLine) tuple, or None to ignore the line.
            For 'DAY_CHANGED' lines the LogLine element is None, and only the updated date is carried.

            Uses self.TYPE_REGEXES to do the matching.
        """

        # empty line
        if not line :
            return

        # look for match
        match = type = None

        # test each type
        for type, regex in self.TYPE_REGEXES :
            # attempt to match
            match = regex.match(line)
            
            # found, break
            if match :
                break
        
        # no match found?
        if not match :
            raise LogParseError(line, offset, "Line did not match any type")
        
        # match groups
        groups = match.groupdict(None)

        # parse timestamp
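        # the matched named group determines the format: 'datetime' (e.g. "Thu Feb 12 00:31:34 2009")
        # for log open/close lines, 'timestamp' (e.g. "00:31:34" with the default "%H:%M:%S" fmt) for
        # normal lines, and 'date' (e.g. "Thu Feb 12 2009") for day-changed lines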
        if 'datetime' in groups :
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
            
            # override date?
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        elif 'date' in groups :
            # parse date-only datetime
            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        else :
            # no timestamp !?
            raise LogParseError(line, offset, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # channel, currently unused
        channel_name = (groups.get('channel') or groups.get('channel2'))

        # source
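        # a (nickname, username, hostname, flags) tuple; netsplit lines only carry the server name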
        if 'server1' in groups :
            source = (None, None, groups.get('server1'), None)

        else :
            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))

        # target
        if 'server2' in groups :
            target = groups.get('server2')

        else :
            target = groups.get('target')

        # data
        if 'message' in groups :
            data = groups['message']
        
        elif 'mode' in groups :
            data = groups['mode']

        elif 'topic' in groups :
            data = groups['topic']
        
        elif 'nick_list' in groups :
            # split into components
            nicks = groups['nick_list'].split(', ')
            
            # additional count?
            if 'count' in groups and groups['count'] :
                nicks.append('+%d' % int(groups['count']))
            
            # join
            data = ' '.join(nicks)

        else :
            data = None
        
        # custom types?
        if type == 'DAY_CHANGED' :
            # no LogLine for these, just propagate the new date to the caller
            return dtz, None
        
        else :
            # build+return (date, LogLine)
            return date, LogLine(channel, offset, type, dtz, source, target, data)

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given lines, yielding LogLines; the current date is updated as day-change lines are seen.
        """

        for offset, line in enumerate(lines) :
            # offset?
            if starting_offset :
                offset = starting_offset + offset

            else :
                offset = None
            
            # try and parse
            try :
                # get None, or a (date, line) tuple (line may be None for day changes)
                line_info = self.parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError :
                raise
            
            # wrap other errors as LogParseError
            except Exception as e :
                raise LogParseError(line, offset, "Parsing line failed: %s" % e)
            
            else :
                # nothing?
                if not line_info :
                    continue
                
                # unpack, update date
                date, line = line_info
                
                # day-changed lines update the date but produce no LogLine
                if line is None :
                    continue
                
                # yield
                yield line
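
# A minimal usage sketch, assuming pytz is installed (the parser calls tz.localize(), i.e. the pytz
# API); the sample lines are made-up, and None stands in for a real LogChannel, which this module
# only stores on the resulting LogLines.
if __name__ == '__main__' :
    import pytz

    parser = IrssiParser(pytz.timezone('Europe/Helsinki'))

    sample_lines = [
        u'--- Log opened Thu Feb 12 00:31:34 2009',
        u'00:31:40 <@terom> hello world',
        u'00:32:01 -!- somenick [user@example.com] has joined #channel',
    ]

    # parse and dump the resulting LogLine objects
    for event in parser.parse_lines(None, sample_lines, date=datetime.date(2009, 2, 12), starting_offset=1) :
        print(event)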