log_parser.py
author Tero Marttila <terom@fixme.fi>
Wed, 11 Feb 2009 04:04:55 +0200
changeset 102 e396613bc873
parent 97 6165f1ba458d
child 103 0e829e6275dc
permissions -rw-r--r--
have LogDirectory use utils.mtime instead of os.stat
"""
    Parse log data into log_events
"""

import re
import datetime

from log_line import LogTypes, LogLine

class LogParseError (Exception) :
    """
        Raised when a single log line cannot be parsed.

        The exception message has the form "<repr of line>@<offset>: <message>".
    """

    def __init__ (self, line, offset, message) :
        """
            Build the exception message from the offending line, its offset (may be None) and a
            description of what went wrong.
        """

        # compose the full message first, then hand it to Exception
        text = "%r@%s: %s" % (line, offset, message)

        super(LogParseError, self).__init__(text)

class LogParser (object) :
    """
        Abstract interface for log parsers.

        Subclasses must override parse_lines.
    """

    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
        """
            Setup the parser to use the given format for line timestamps, which are of the given timezone.

            Both values are simply stored on the instance (as self.tz and self.timestamp_fmt) for use by
            subclasses.
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given (iterable) lines of unicode text (without trailing newlines) into log events.

            Channel is the LogChannel that these lines belong to.

            starting_offset is the offset of the first line, and may be None to not use offsets.
            
            Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
            information, event timestamps will have a date component of 1900/1/1.

            This base implementation always raises NotImplementedError.
        """

        # explicit "abstract method" marker, instead of relying on a NameError from an undefined name
        raise NotImplementedError("%s.parse_lines" % self.__class__.__name__)

class IrssiParser (LogParser) :
    """
        A parser for irssi logfiles.

        Each line is matched against the patterns in TYPE_REGEXES, in order, and the first pattern that
        matches determines the type of the resulting LogLine.
    """

    # subexpression parts
    _TS = r'(?P<timestamp>\S+)'
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'
    _SRV1 = r'(?P<server1>.+?)'
    _SRV2 = r'(?P<server2>.+?)'

    # regular expressions for matching lines, by type
    #
    # NOTE(review): these are applied with re.match() and are not anchored at the end of the line, so a
    # lazy group at the very end of a pattern (e.g. the channel in JOIN) only captures a single character,
    # and a bracketed message stops at the first ']'. Left as-is, since anchoring could reject lines that
    # are accepted today.
    TYPE_EXPRS = (
        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
        (   LogTypes.MSG,           _TS + r' <(?P<flags>.)' + _NICK + r'> ' + _MSG                  ),
        (   LogTypes.NOTICE,        _TS + r' -' + _NICK + r':' + _CHAN + r'- ' + _MSG               ),
        (   LogTypes.ACTION,        _TS + r'  \* ' + _NICK + r' ' + _MSG                            ),
        (   LogTypes.JOIN,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN                             ),
        (   LogTypes.PART,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'    ),
        (   LogTypes.KICK,          _TS + r' -!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]'),
        (   LogTypes.MODE,          _TS + r' -!- mode/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                            ),
        (   LogTypes.NICK,          _TS + r' -!- ' + _NICK + r' is now known as (?P<target>\S+)'                                        ),
        (   LogTypes.QUIT,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'                   ),
        (   LogTypes.TOPIC,         _TS + r' -!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'    ),

        (   LogTypes.SELF_NOTICE,   _TS + r' \[notice\(' + _CHAN + r'\)\] ' + _MSG                  ),
        (   LogTypes.SELF_NICK,     _TS + r' -!- You\'re now known as (?P<target>\S+)'              ),

        (   LogTypes.NETSPLIT_START,    _TS + r' -!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
        (   LogTypes.NETSPLIT_END,      _TS + r' -!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'             ),

        (   'DAY_CHANGED',          r'--- Day changed (?P<date>.+)'                                 ),
    )

    # precompile
    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]

    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, using self.TYPE_REGEXES to do the matching.

            Returns None for an empty line (which is to be ignored), and otherwise a (date, LogLine) tuple,
            where date is the given date, or the newly parsed one if the line was a "Day changed" marker.

            Timestamps are parsed as naive datetimes and then passed through self.tz.localize(). If date is
            given, it replaces the year/month/day of per-line timestamps.

            Raises LogParseError if the line does not match any known type, or if the matched type carries
            no timestamp information.
        """

        # empty line, nothing to parse
        if not line :
            return None

        # test each type in order, the first match wins
        for line_type, regex in self.TYPE_REGEXES :
            match = regex.match(line)

            if match :
                break

        else :
            # ran out of types without finding a match
            raise LogParseError(line, offset, "Line did not match any type")
        
        # named groups of the matched expression; groups that did not participate are None
        groups = match.groupdict(None)

        # parse timestamp
        if 'datetime' in groups :
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
            
            # override date?
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        elif 'date' in groups :
            # parse date-only datetime
            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        else :
            # no timestamp !?
            raise LogParseError(line, offset, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # source: (nickname, username, hostname, flags), or just the first server for netsplits
        if 'server1' in groups :
            source = (None, None, groups.get('server1'), None)

        else :
            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))

        # target: the second server for netsplits, otherwise the target nickname (if any)
        if 'server2' in groups :
            target = groups.get('server2')

        else :
            target = groups.get('target')

        # data
        if 'message' in groups :
            data = groups['message']
        
        elif 'mode' in groups :
            data = groups['mode']

        elif 'topic' in groups :
            data = groups['topic']
        
        elif 'nick_list' in groups :
            # split into components
            nicks = groups['nick_list'].split(', ')
            
            # additional count of nicknames that were not listed?
            if groups.get('count') :
                nicks.append('+%d' % int(groups['count']))
            
            # join into a space-separated list
            data = ' '.join(nicks)

        else :
            data = None
        
        # custom types?
        if line_type == 'DAY_CHANGED' :
            # new date, applies to the lines following this one
            date = dtz

        # build+return (date, LogLine)
        return date, LogLine(channel, offset, line_type, dtz, source, target, data)

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given lines, yielding the resulting LogLines.

            Empty lines are skipped. If starting_offset is not None, each line gets the offset
            starting_offset + <index of line>, otherwise the offsets are None. The date is carried over
            from line to line, and updated whenever parse_line returns a new one.

            Raises LogParseError if a line fails to parse; any other exception raised while parsing a line
            is wrapped in a LogParseError.
        """

        for index, line in enumerate(lines) :
            # offset? Compare against None, so that a starting offset of zero is honoured as well
            if starting_offset is not None :
                offset = starting_offset + index

            else :
                offset = None
            
            # try and parse
            try :
                result = self.parse_line(channel, line, date, offset)
            
            # passthrough LogParseError's
            except LogParseError :
                raise
            
            # wrap other errors as LogParseError
            except Exception as e :
                raise LogParseError(line, offset, "Parsing line failed: %s" % e)
            
            # parse_line returns None for lines that are to be ignored
            if result is None :
                continue

            # update date as needed
            date, event = result

            # yield unless None
            if event :
                yield event