diff -r 0521cf830eb9 -r 645cf9c4441e log_parser.py --- a/log_parser.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_parser.py Tue Feb 10 23:00:11 2009 +0200 @@ -2,10 +2,18 @@ Parse log data into log_events """ +import re import datetime -import log_line -from log_line import LogTypes +from log_line import LogTypes, LogLine + +class LogParseError (Exception) : + """ + Parsing some line failed + """ + + def __init__ (self, offset, line, message) : + super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message)) class LogParser (object) : """ @@ -20,10 +28,12 @@ self.tz = tz self.timestamp_fmt = timestamp_fmt - def parse_lines (self, lines, date=None, starting_offset=None) : + def parse_lines (self, channel, lines, date=None, starting_offset=None) : """ Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline. + Channel is the LogChannel that these lines belong to. + Offset is the starting offset, and may be None to not use it. Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date @@ -32,13 +42,114 @@ abstract - class IrssiParser (LogParser) : """ A parser for irssi logfiles """ - def parse_lines (self, lines, date=None, starting_offset=None) : + # subexpression parts + _TS = r'(?P<timestamp>\S+)' + _NICK = r'(?P<nickname>.+?)' + _NICK2 = r'(?P<nickname2>.+?)' + _CHAN = r'(?P<channel>.+?)' + _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)' + _MSG = r'(?P<message>.*)' + + # regular expressions for matching lines, by type + TYPE_EXPRS = ( + ( LogTypes.LOG_OPEN, r'--- Log opened (?P<datetime>.+)' ), + ( LogTypes.LOG_CLOSE, r'--- Log closed (?P<datetime>.+)' ), + ( LogTypes.MSG, _TS + r' <(?P<flags>.)' + _NICK + '> ' + _MSG ), + ( LogTypes.NOTICE, _TS + r' -' + _NICK + ':' + _CHAN + '- ' + _MSG ), + ( LogTypes.ACTION, _TS + r' \* ' + _NICK + ' ' + _MSG ), + ( LogTypes.JOIN, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has joined ' + _CHAN ), + ( LogTypes.PART, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has left ' + _CHAN + ' \[(?P<message>.*?)\]' ), + ( LogTypes.KICK, _TS + r' -!- ' 
+ _NICK2 + ' was kicked from ' + _CHAN + ' by ' + _NICK + ' \[(?P<message>.*?)\]' ), + ( LogTypes.MODE, _TS + r' -!- mode/' + _CHAN + ' \[(?P<mode>.+?)\] by (?P<nickname>\S+)' ), + ( LogTypes.NICK, _TS + r' -!- ' + _NICK + ' is now known as (?P<nickname2>\S+)' ), + ( LogTypes.QUIT, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has quit \[(?P<message>.*?)\]' ), + ( LogTypes.TOPIC, _TS + r' -!- ' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P<topic>.*)' ), + + ( LogTypes.SELF_NOTICE, _TS + r' \[notice\(' + _CHAN + '\)\] ' + _MSG ), + ( LogTypes.SELF_NICK, _TS + r' -!- You\'re now known as (?P<nickname2>\S+)' ), + ) + + # precompile + TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS] + + def parse_line (self, channel, line, date, offset=None) : + """ + Parse a single line, and return the resulting LogLine, or None, to ignore the line. + + Uses self.TYPE_REGEXES to do the matching + """ + + # empty line + if not line : + return + + # look for match + match = type = None + + # test each type + for type, regex in self.TYPE_REGEXES : + # attempt to match + match = regex.match(line) + + # found, break + if match : + break + + # no match found? + if not match : + raise LogParseError(offset, line, "Line did not match any type") + + # match groups + groups = match.groupdict(None) + + # parse timestamp + if 'datetime' in groups : + # parse datetime using default asctime() format + dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y') + + elif 'timestamp' in groups : + # parse timestamp into naive datetime + dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt) + + # override date? + if date : + dt = dt.replace(year=date.year, month=date.month, day=date.day) + + else : + # no timestamp !? 
+ raise LogParseError(offset, line, "No timestamp") + + # now localize with timezone + dtz = self.tz.localize(dt) + + # source + source = (groups.get('nickname'), groups.get('username'), groups.get('hostname'), groups.get('flags')) + + # target + target = groups.get('nickname2') + + # data + if 'message' in groups : + data = groups['message'] + + elif 'mode' in groups : + data = groups['mode'] + + elif 'topic' in groups : + data = groups['topic'] + + else : + data = None + + # build+return LogLine + return LogLine(channel, offset, type, dtz, source, target, data) + + def parse_lines (self, channel, lines, date=None, starting_offset=None) : """ Parse the given lines, yielding LogEvents. """ @@ -53,45 +164,19 @@ # try and parse try : - line = self.parse_line(line, date, offset) - + line = self.parse_line(channel, line, date, offset) + + # passthrough LogParseError's + except LogParseError : + raise + + # wrap other errors as LogParseError except Exception, e : - raise Exception("Parsing line failed: %r@%d: %s" % (line, offset, e)) + raise LogParseError(line, offset, "Parsing line failed: %s" % e) else : # yield unless None if line : yield line - def parse_line (self, line, date, offset=None) : - """ - Parse a single line, and return the resulting LogLine, or None, to ignore the line - """ - - # empty line - if not line : - return - # status lines - elif line.startswith('---') : - # XXX: handle these - return - - # normal lines - else : - # XXX: only parse timestamps for now - timestamp, data = line.split(' ', 1) - - # parse timestamp into naive datetime - dt = datetime.datetime.strptime(timestamp, self.timestamp_fmt) - - # override date? - if date : - dt = dt.replace(year=date.year, month=date.month, day=date.day) - - # now localize with timezone - dtz = self.tz.localize(dt) - - # build raw event - return log_line.LogLine(offset, LogTypes.RAW, dtz, None, data) -