# author Tero Marttila <terom@fixme.fi>
# Tue, 28 Feb 2012 13:10:09 +0200
# changeset 147 95b0a3fdd207
# parent 140 6db2527b67cf
# qmsk-irclogs.fcgi: using #! python, not python2.5
"""
    Parse log data into log_events
"""

import re
import datetime

from log_line import LogTypes, LogLine

class LogParseError(Exception):
    """
        Parsing some line failed.

        The exception message has the form "<repr(line)>@<offset>: <message>".
    """

    def __init__(self, line, offset, message):
        """
            line    - the raw line that could not be parsed
            offset  - the offset of that line, or None when offsets are not in use
            message - a description of what went wrong
        """

        super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))

class LogParser(object):
    """
        Abstract interface for log parsers.
    """

    def __init__(self, tz, timestamp_fmt="%H:%M:%S"):
        """
            Setup the parser to use the given format for line timestamps, which are of the given timezone.

            tz              - timezone that the line timestamps are in
            timestamp_fmt   - strptime-style format of the per-line timestamp
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines(self, channel, lines, date=None, starting_offset=None):
        """
            Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.

            Channel is the LogChannel that these lines belong to.

            Offset is the starting offset, and may be None to not use it.

            Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
            information, event timestamps will have a date component of 1900/1/1.

            Subclasses must override this; the base implementation always raises NotImplementedError.
        """

        raise NotImplementedError("%s.parse_lines" % type(self).__name__)

class IrssiParser(LogParser):
    """
        A parser for irssi logfiles.

        Each line is matched against TYPE_REGEXES in order; the named groups of the first matching pattern
        are turned into a LogLine.
    """

    # timestamp prefix, with trailing space
    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'

    # subexpression parts
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'
    _SRV1 = r'(?P<server1>.+?)'
    _SRV2 = r'(?P<server2>.+?)'

    # regular expressions for matching lines, by type
    #
    # NOTE: these are applied with re.match() and are not anchored at the end, so a lazy group that ends a
    # pattern (e.g. the channel in JOIN) only captures a minimal prefix. The channel groups are currently
    # not used when building the LogLine.
    TYPE_EXPRS = (
        (LogTypes.LOG_OPEN, r'--- Log opened (?P<datetime>.+)'),
        (LogTypes.LOG_CLOSE, r'--- Log closed (?P<datetime>.+)'),
        (LogTypes.MSG, _TS + r'<(?P<flags>.)' + _NICK + r'> ' + _MSG),
        (LogTypes.NOTICE, _TS + r'-' + _NICK + r':' + _CHAN + r'- ' + _MSG),
        (LogTypes.ACTION, _TS + r'\* ' + _NICK + r' ' + _MSG),
        (LogTypes.JOIN, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN),
        (LogTypes.PART, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN
            + r' \[(?P<message>.*?)\]'),
        (LogTypes.KICK, _TS + r'-!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK
            + r' \[(?P<message>.*?)\]'),
        # XXX: use hostname instead of nickname for ServerMode
        (LogTypes.MODE, _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'),
        (LogTypes.NICK, _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)'),
        (LogTypes.QUIT, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'),
        (LogTypes.TOPIC, _TS + r'-!- (' + _NICK + r' changed the topic of ' + _CHAN
            + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'),

        (LogTypes.SELF_NOTICE, _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG),
        (LogTypes.SELF_NICK, _TS + r'-!- You\'re now known as (?P<target>\S+)'),

        (LogTypes.NETSPLIT_START, _TS + r'-!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2
            + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
        (LogTypes.NETSPLIT_END, _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'),

        # parser-internal pseudo-type: updates the running date, never produces a LogLine
        ('DAY_CHANGED', r'--- Day changed (?P<date>.+)'),
    )

    # precompile
    TYPE_REGEXES = [(log_type, re.compile(pattern)) for log_type, pattern in TYPE_EXPRS]

    def _parse_line(self, channel, line, date, offset=None):
        """
            Parse a single line, returning a (date, event) tuple.

            date is the date to use for subsequent lines: the given date, or the new date if this line was
            a "Day changed" marker. event is the resulting LogLine, or None if the line produced no event
            (empty line, or a "Day changed" marker).

            Raises LogParseError if the line matches none of TYPE_REGEXES, or the match has no timestamp.
            Errors from timestamp parsing (strptime) propagate unchanged.
        """

        # empty line
        if not line:
            return date, None

        # test each type in order, stopping at the first match
        for line_type, regex in self.TYPE_REGEXES:
            match = regex.match(line)

            if match:
                break

        else:
            # no match found
            raise LogParseError(line, offset, "Line did not match any type")

        # all named groups of the matched pattern; groups that did not participate are None
        groups = match.groupdict(None)

        # parse timestamp
        if 'datetime' in groups:
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups:
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override date?
            if date is not None:
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        elif 'date' in groups:
            # parse date-only datetime
            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        else:
            # no timestamp !?
            raise LogParseError(line, offset, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # custom types?
        if line_type == 'DAY_CHANGED':
            # hand the new date back to the caller; there is no event for this line
            return dtz, None

        # source: (nickname, username, hostname, flags); netsplits use the first server as the hostname
        if 'server1' in groups:
            source = (None, None, groups.get('server1'), None)

        else:
            source = (
                groups.get('nickname') or groups.get('nickname2'),
                groups.get('username'),
                groups.get('hostname'),
                groups.get('flags'),
            )

        # target
        if 'server2' in groups:
            target = groups.get('server2')

        else:
            target = groups.get('target')

        # data
        if 'message' in groups:
            data = groups['message']

        elif 'mode' in groups:
            data = groups['mode']

        elif 'topic' in groups:
            data = groups['topic']

        elif 'nick_list' in groups:
            # split into components
            nicks = groups['nick_list'].split(', ')

            # additional count?
            if groups.get('count'):
                nicks.append('+%d' % int(groups['count']))

            # join
            data = ' '.join(nicks)

        else:
            data = None

        # build+return (date, LogLine)
        return date, LogLine(channel, offset, line_type, dtz, source, target, data)

    def parse_line(self, channel, line, date, offset=None):
        """
            Parse a single line, and return the resulting (date, LogLine), or None, to ignore the line.

            Uses self.TYPE_REGEXES to do the matching.

            Empty lines and "Day changed" markers return None; use parse_lines to have day changes applied
            to the following lines.
        """

        date, event = self._parse_line(channel, line, date, offset)

        if event is None:
            return None

        return date, event

    def parse_lines(self, channel, lines, date=None, starting_offset=None):
        """
            Parse the given lines, yielding LogEvents.

            channel         - passed through to each LogLine
            lines           - iterable of lines, no trailing newline
            date            - initial date for lines that only carry a time; replaced whenever a
                              "Day changed" marker is seen
            starting_offset - offset of the first line, or None to give every event an offset of None

            Raises LogParseError for any line that cannot be parsed; other errors raised while parsing a
            line are wrapped in LogParseError.
        """

        for index, line in enumerate(lines):
            # offset? (compare against None so that a starting offset of 0 is honoured)
            if starting_offset is not None:
                offset = starting_offset + index

            else:
                offset = None

            # try and parse
            try:
                date, event = self._parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError:
                raise

            # wrap other errors as LogParseError
            except Exception as e:
                raise LogParseError(line, offset, "Parsing line failed: %s" % e)

            # nothing to yield for empty lines / day changes
            if event is not None:
                yield event