log_parser.py
author Tero Marttila <terom@fixme.fi>
Wed, 11 Feb 2009 02:07:07 +0200
changeset 93 48fca00689e3
parent 92 74f6a0b01ddf
child 97 6165f1ba458d
permissions -rw-r--r--
implement scripts/search-index autoload
"""
    Parse log data into log_events
"""

import re
import datetime

from log_line import LogTypes, LogLine

class LogParseError (Exception) :
    """
        Raised when a log line cannot be parsed.

        The exception message has the form "<repr of line>@<offset>: <message>".
    """

    def __init__ (self, offset, line, message) :
        """
            offset  - offset of the offending line, formatted with %s (so None is acceptable)
            line    - the offending line itself, formatted with %r
            message - description of what went wrong
        """

        description = "%r@%s: %s" % (line, offset, message)

        super(LogParseError, self).__init__(description)

class LogParser (object) :
    """
        Abstract interface for log parsers.

        Holds the timezone and line-timestamp format; concrete parsers override parse_lines.
    """

    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
        """
            Setup the parser to use the given format for line timestamps, which are of the given timezone.

            tz            - timezone of the timestamps found in the log lines, stored as self.tz
            timestamp_fmt - strptime-style format of the per-line timestamps, stored as self.timestamp_fmt
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.

            Channel is the LogChannel that these lines belong to.

            Offset is the starting offset, and may be None to not use it.

            Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
            information, event timestamps will have a date component of 1900/1/1.

            This base implementation always raises NotImplementedError; subclasses must override it.
        """

        # explicit "abstract method" error, rather than relying on an undefined name to blow up
        raise NotImplementedError("%s.parse_lines is abstract" % type(self).__name__)

class IrssiParser (LogParser) :
    """
        A parser for irssi logfiles.

        Each line is matched against TYPE_REGEXES in order, and the first expression that matches decides the type
        of the resulting LogLine.
    """

    # subexpression parts
    _TS = r'(?P<timestamp>\S+)'
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'

    # regular expressions for matching lines, by type
    #
    # These are applied with re.match(), i.e. anchored at the start of the line only, and tried in the order given.
    #
    # NOTE: as there is no end anchor, a lazy group at the very end of an expression (e.g. channel in JOIN, channel2
    # in TOPIC) only captures its first character. parse_line does not use the channel groups, so this is left as-is.
    TYPE_EXPRS = (
        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
        (   LogTypes.MSG,           _TS + r' <(?P<flags>.)' + _NICK + r'> ' + _MSG                  ),
        (   LogTypes.ACTION,        _TS + r'  \* ' + _NICK + r' ' + _MSG                            ),
        (   LogTypes.JOIN,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN                             ),
        (   LogTypes.PART,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'    ),
        (   LogTypes.KICK,          _TS + r' -!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]'),
        (   LogTypes.MODE,          _TS + r' -!- mode/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                            ),
        (   LogTypes.NICK,          _TS + r' -!- ' + _NICK + r' is now known as (?P<target>\S+)'                                        ),
        (   LogTypes.QUIT,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'                   ),
        (   LogTypes.TOPIC,         _TS + r' -!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'  ),

        (   LogTypes.SELF_NOTICE,   _TS + r' \[notice\(' + _CHAN + r'\)\] ' + _MSG                  ),
        (   LogTypes.SELF_NICK,     _TS + r' -!- You\'re now known as (?P<target>\S+)'           ),

        # NOTICE is tried last: its loose pattern (' -' nick ':' chan '- ' msg) also matches '-!-' status lines whose
        # text happens to contain a ':' followed later by '- ' (e.g. a topic changed to 'foo - bar'), so the specific
        # '-!-' expressions above must get the first chance at such lines.
        (   LogTypes.NOTICE,        _TS + r' -' + _NICK + r':' + _CHAN + r'- ' + _MSG               ),
    )

    # precompile
    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]

    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, and return the resulting LogLine, or None, to ignore the line (empty lines).

            channel and offset are passed through to the LogLine as-is. If date is given, its year/month/day replace
            the date component of lines that carry a 'timestamp' group (parsed with self.timestamp_fmt); lines with
            a 'datetime' group (log open/close) are parsed using the asctime() format and keep their own date.

            The naive datetime is then made timezone-aware via self.tz.localize(), so self.tz must provide that
            method (presumably a pytz timezone - confirm against callers).

            Uses self.TYPE_REGEXES to do the matching.

            Raises LogParseError if the line matches no expression, or the matched expression has no timestamp.
            Other errors (e.g. ValueError from strptime) propagate unchanged.
        """

        # empty line
        if not line :
            return None

        # test each type in order, the first match wins
        for line_type, regex in self.TYPE_REGEXES :
            match = regex.match(line)

            if match :
                break

        else :
            # ran out of expressions without a match
            raise LogParseError(offset, line, "Line did not match any type")

        # every named group of the matched expression; groups that did not participate are None
        groups = match.groupdict(None)

        # parse timestamp
        if 'datetime' in groups :
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override date?
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        else :
            # no timestamp !?
            raise LogParseError(offset, line, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # source: (nickname, username, hostname, flags), each None if the expression has no such group
        source = (
            groups.get('nickname') or groups.get('nickname2'),
            groups.get('username'),
            groups.get('hostname'),
            groups.get('flags'),
        )

        # target
        target = groups.get('target')

        # data: the first of these groups that the matched expression defines, even if its value is None
        for key in ('message', 'mode', 'topic') :
            if key in groups :
                data = groups[key]
                break

        else :
            data = None

        # build+return LogLine
        return LogLine(channel, offset, line_type, dtz, source, target, data)

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given lines, yielding the LogLine built by parse_line for each line that is not ignored.

            If starting_offset is not None (0 is a valid value), the i'th line (counting from zero) is given the
            offset starting_offset + i; otherwise every line gets an offset of None.

            Raises LogParseError for any line that fails to parse; errors of other types raised while parsing a line
            are wrapped into a LogParseError carrying that line and its offset.
        """

        for index, line in enumerate(lines) :
            # offset?
            if starting_offset is not None :
                offset = starting_offset + index

            else :
                offset = None

            # try and parse
            try :
                event = self.parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError :
                raise

            # wrap other errors as LogParseError
            except Exception as e :
                raise LogParseError(offset, line, "Parsing line failed: %s" % e)

            # parse_line returns None for lines that are to be ignored
            if event :
                yield event