# Origin: qmsk/irclogs/log_parser.py, added in hg changeset 6db2527b67cf (2009-09-13).
"""
Parse log data into log_events.
"""

import re
import datetime

from log_line import LogTypes, LogLine


class LogParseError(Exception):
    """
    Parsing some line failed.

    The exception text is "<repr(line)>@<offset>: <message>"; the offending
    line and its offset are also kept as attributes for callers that want them.
    """

    def __init__(self, line, offset, message):
        super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))

        self.line = line
        self.offset = offset


class LogParser(object):
    """
    Abstract interface for log parsers.
    """

    def __init__(self, tz, timestamp_fmt="%H:%M:%S"):
        """
        Set up the parser.

        tz              - timezone that line timestamps are in; must provide
                          localize(naive_datetime) (presumably a pytz timezone)
        timestamp_fmt   - strptime() format of the per-line timestamp
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines(self, channel, lines, date=None, starting_offset=None):
        """
        Parse the given (iterable) lines of unicode text, without trailing
        newlines, into log events.

        channel         - the LogChannel that these lines belong to
        lines           - iterable of lines
        date            - lets the parser build full timestamps; without it,
                          and unless the line timestamps carry full date
                          information, event timestamps get a date component
                          of 1900/1/1
        starting_offset - offset of the first line, or None to not use offsets

        Subclasses must override this.
        """

        raise NotImplementedError("LogParser.parse_lines is abstract")


class IrssiParser(LogParser):
    """
    A parser for irssi logfiles.
    """

    # pseudo-type for day-change lines; they only update the running date and
    # never produce a LogLine
    _DAY_CHANGED = 'DAY_CHANGED'

    # timestamp prefix, with trailing space
    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'

    # subexpression parts
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'
    _SRV1 = r'(?P<server1>.+?)'
    _SRV2 = r'(?P<server2>.+?)'

    # Regular expressions for matching lines, by type. Order matters: the
    # first pattern that matches wins.
    #
    # NOTE: the patterns are applied with re.match() and are not end-anchored,
    # so a lazy group at the very end of a pattern (e.g. the channel in JOIN)
    # only captures its first character. The channel is currently unused.
    TYPE_EXPRS = (
        (LogTypes.LOG_OPEN, r'--- Log opened (?P<datetime>.+)'),
        (LogTypes.LOG_CLOSE, r'--- Log closed (?P<datetime>.+)'),
        (LogTypes.MSG, _TS + r'<(?P<flags>.)' + _NICK + r'> ' + _MSG),
        (LogTypes.NOTICE, _TS + r'-' + _NICK + r':' + _CHAN + r'- ' + _MSG),
        (LogTypes.ACTION, _TS + r'\* ' + _NICK + r' ' + _MSG),
        (LogTypes.JOIN, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN),
        (LogTypes.PART, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'),
        (LogTypes.KICK, _TS + r'-!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]'),
        # XXX: use hostname instead of nickname for ServerMode
        (LogTypes.MODE, _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'),
        (LogTypes.NICK, _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)'),
        (LogTypes.QUIT, _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'),
        (LogTypes.TOPIC, _TS + r'-!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'),

        (LogTypes.SELF_NOTICE, _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG),
        (LogTypes.SELF_NICK, _TS + r"-!- You're now known as (?P<target>\S+)"),

        (LogTypes.NETSPLIT_START, _TS + r'-!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
        (LogTypes.NETSPLIT_END, _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'),

        (_DAY_CHANGED, r'--- Day changed (?P<date>.+)'),
    )

    # precompile
    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]

    def _parse_timestamp(self, groups, date, line, offset):
        """
        Build the naive datetime for a matched line from its match groups.

        Raises LogParseError if the matched pattern carries no time information
        at all; strptime() errors propagate as-is.
        """

        if 'datetime' in groups:
            # full timestamp in the default asctime() format
            return datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        if 'timestamp' in groups:
            # time-of-day only; strptime() fills in 1900-01-01 as the date
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override date?
            if date:
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

            return dt

        if 'date' in groups:
            # date-only datetime
            return datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        # no timestamp !?
        raise LogParseError(line, offset, "No timestamp")

    @staticmethod
    def _build_source(groups):
        """
        Build the (nickname, username, hostname, flags) source tuple.
        """

        if 'server1' in groups:
            # netsplit: the first server goes into the hostname slot
            return (None, None, groups.get('server1'), None)

        return (
            groups.get('nickname') or groups.get('nickname2'),
            groups.get('username'),
            groups.get('hostname'),
            groups.get('flags'),
        )

    @staticmethod
    def _build_data(groups):
        """
        Pick the event payload: message, mode, topic or a space-separated nick list.
        """

        # the first group *defined by the matched pattern* wins, even if it
        # did not take part in the match (and is thus None)
        for key in ('message', 'mode', 'topic'):
            if key in groups:
                return groups[key]

        if 'nick_list' in groups:
            # split into components
            nicks = groups['nick_list'].split(', ')

            # additional count?
            if groups.get('count'):
                nicks.append('+%d' % int(groups['count']))

            return ' '.join(nicks)

        return None

    def parse_line(self, channel, line, date, offset=None):
        """
        Parse a single line, using self.TYPE_REGEXES to do the matching.

        Returns:
            None                - for an empty line, which is ignored
            (date, LogLine)     - for a regular line; date is passed through unchanged
            (new_date, None)    - for a "Day changed" line; there is no event, but
                                  the caller must use new_date for following lines

        Raises LogParseError if the line matches no known type or carries no
        timestamp. Timestamp format errors from strptime() propagate as-is.
        """

        # empty line
        if not line:
            return None

        # test each type, first match wins
        for line_type, regex in self.TYPE_REGEXES:
            match = regex.match(line)

            if match:
                break

        else:
            raise LogParseError(line, offset, "Line did not match any type")

        # match groups
        groups = match.groupdict(None)

        # parse the timestamp and localize it with our timezone
        dtz = self.tz.localize(self._parse_timestamp(groups, date, line, offset))

        # custom types?
        if line_type == self._DAY_CHANGED:
            # hand the new date back to the caller; previously it was assigned
            # to a local and then dropped, so day changes had no effect
            return dtz, None

        # target
        if 'server2' in groups:
            target = groups.get('server2')

        else:
            target = groups.get('target')

        # build+return (date, LogLine)
        return date, LogLine(
            channel, offset, line_type, dtz,
            self._build_source(groups), target, self._build_data(groups)
        )

    def parse_lines(self, channel, lines, date=None, starting_offset=None):
        """
        Parse the given lines, yielding one LogLine per non-ignored line.

        Offsets are starting_offset + line index, or None for every line if
        starting_offset is None. A starting_offset of 0 is a valid offset.

        Raises LogParseError for any line that fails to parse; other exceptions
        raised while parsing a line are wrapped into LogParseError.
        """

        for index, line in enumerate(lines):
            # offset?
            if starting_offset is not None:
                offset = starting_offset + index

            else:
                offset = None

            # try and parse
            try:
                # get None or (date, event)
                line_info = self.parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError:
                raise

            # wrap other errors as LogParseError
            except Exception as e:
                raise LogParseError(line, offset, "Parsing line failed: %s" % (e, ))

            # nothing?
            if not line_info:
                continue

            # unpack, update date
            date, event = line_info

            # day changes update the date without producing an event
            if event is not None:
                yield event