terom@50: """ terom@50: Parse log data into log_events terom@50: """ terom@50: terom@86: import re terom@50: import datetime terom@50: terom@86: from log_line import LogTypes, LogLine terom@86: terom@86: class LogParseError (Exception) : terom@86: """ terom@86: Parsing some line failed terom@86: """ terom@86: terom@97: def __init__ (self, line, offset, message) : terom@86: super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message)) terom@50: terom@50: class LogParser (object) : terom@50: """ terom@50: Abstract interface terom@50: """ terom@50: terom@50: def __init__ (self, tz, timestamp_fmt="%H:%M:%S") : terom@50: """ terom@50: Setup the parser to use the given format for line timestamps, which are of the given timezone terom@50: """ terom@50: terom@50: self.tz = tz terom@50: self.timestamp_fmt = timestamp_fmt terom@50: terom@86: def parse_lines (self, channel, lines, date=None, starting_offset=None) : terom@50: """ terom@50: Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline. terom@64: terom@86: Channel is the LogChannel that these lines belong to. terom@86: terom@64: Offset is the starting offset, and may be None to not use it. terom@50: terom@50: Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date terom@50: information, event timestamps will have a date component of 1900/1/1. terom@50: """ terom@50: terom@50: abstract terom@50: terom@50: class IrssiParser (LogParser) : terom@50: """ terom@50: A parser for irssi logfiles terom@50: """ terom@110: terom@110: # timestamp prefix, with trailing space terom@110: _TS = r'(?P[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*' terom@50: terom@86: # subexpression parts terom@86: _NICK = r'(?P.+?)' terom@86: _NICK2 = r'(?P.+?)' terom@92: _TARGET = r'(?P.+?)' terom@86: _CHAN = r'(?P.+?)' terom@92: _CHAN2 = r'(?P.+?)' terom@86: _USERHOST = r'(?P.*?)@(?P.*?)' terom@86: _MSG = r'(?P.*)' terom@97: _SRV1 = r'(?P.+?)' terom@97: _SRV2 = r'(?P.+?)' terom@86: terom@86: # regular expressions for matching lines, by type terom@86: TYPE_EXPRS = ( terom@86: ( LogTypes.LOG_OPEN, r'--- Log opened (?P.+)' ), terom@86: ( LogTypes.LOG_CLOSE, r'--- Log closed (?P.+)' ), terom@110: ( LogTypes.MSG, _TS + r'<(?P.)' + _NICK + '> ' + _MSG ), terom@110: ( LogTypes.NOTICE, _TS + r'-' + _NICK + ':' + _CHAN + '- ' + _MSG ), terom@110: ( LogTypes.ACTION, _TS + r'\* ' + _NICK + ' ' + _MSG ), terom@110: ( LogTypes.JOIN, _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has joined ' + _CHAN ), terom@110: ( LogTypes.PART, _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has left ' + _CHAN + ' \[(?P.*?)\]' ), terom@110: ( LogTypes.KICK, _TS + r'-!- ' + _TARGET + ' was kicked from ' + _CHAN + ' by ' + _NICK + ' \[(?P.*?)\]' ), terom@103: # XXX: use hostname instead of nickname for ServerMode terom@110: ( LogTypes.MODE, _TS + r'-!- (mode|ServerMode)/' + _CHAN + ' \[(?P.+?)\] by (?P\S+)' ), terom@110: ( LogTypes.NICK, _TS + r'-!- ' + _NICK + ' is now known as (?P\S+)' ), terom@110: ( LogTypes.QUIT, _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has quit \[(?P.*?)\]' ), terom@110: ( LogTypes.TOPIC, _TS + r'-!- (' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P.*)|Topic unset by ' + _NICK2 + ' on ' + _CHAN2 + ')' ), terom@86: terom@110: ( LogTypes.SELF_NOTICE, _TS + r'\[notice\(' + _CHAN + '\)\] ' + _MSG ), terom@110: ( LogTypes.SELF_NICK, _TS + r'-!- You\'re now known as (?P\S+)' ), terom@97: terom@110: ( LogTypes.NETSPLIT_START, _TS + r'-!- Netsplit ' + _SRV1 + ' <-> ' + _SRV2 + ' quits: (?P[^(]+)( 
\(\+(?P\d+) more,\S+\))?'), terom@110: ( LogTypes.NETSPLIT_END, _TS + r'-!- Netsplit over, joins: (?P[^(]+)( \(\+(?P\d+) more\))?' ), terom@97: terom@97: ( 'DAY_CHANGED', r'--- Day changed (?P.+)' ), terom@86: ) terom@86: terom@86: # precompile terom@86: TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS] terom@86: terom@86: def parse_line (self, channel, line, date, offset=None) : terom@86: """ terom@86: Parse a single line, and return the resulting LogLine, or None, to ignore the line. terom@86: terom@86: Uses self.TYPE_REGEXES to do the matching terom@86: """ terom@86: terom@86: # empty line terom@86: if not line : terom@86: return terom@86: terom@86: # look for match terom@86: match = type = None terom@86: terom@86: # test each type terom@86: for type, regex in self.TYPE_REGEXES : terom@86: # attempt to match terom@86: match = regex.match(line) terom@86: terom@86: # found, break terom@86: if match : terom@86: break terom@86: terom@86: # no match found? terom@86: if not match : terom@97: raise LogParseError(line, offset, "Line did not match any type") terom@86: terom@86: # match groups terom@86: groups = match.groupdict(None) terom@86: terom@86: # parse timestamp terom@86: if 'datetime' in groups : terom@86: # parse datetime using default asctime() format terom@86: dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y') terom@86: terom@86: elif 'timestamp' in groups : terom@86: # parse timestamp into naive datetime terom@86: dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt) terom@86: terom@86: # override date? terom@86: if date : terom@86: dt = dt.replace(year=date.year, month=date.month, day=date.day) terom@86: terom@97: elif 'date' in groups : terom@97: # parse date-only datetime terom@97: dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y') terom@97: terom@86: else : terom@86: # no timestamp !? terom@97: raise LogParseError(line, offset, "No timestamp") terom@86: terom@86: # now localize with timezone terom@86: dtz = self.tz.localize(dt) terom@86: terom@92: # channel, currently unused terom@92: channel_name = (groups.get('channel') or groups.get('channel2')) terom@92: terom@86: # source terom@97: if 'server1' in groups : terom@97: source = (None, None, groups.get('server1'), None) terom@97: terom@97: else : terom@97: source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags')) terom@86: terom@86: # target terom@97: if 'server2' in groups : terom@97: target = groups.get('server2') terom@97: terom@97: else : terom@97: target = groups.get('target') terom@86: terom@86: # data terom@86: if 'message' in groups : terom@86: data = groups['message'] terom@86: terom@86: elif 'mode' in groups : terom@86: data = groups['mode'] terom@86: terom@86: elif 'topic' in groups : terom@86: data = groups['topic'] terom@86: terom@97: elif 'nick_list' in groups : terom@97: # split into components terom@97: list = groups['nick_list'].split(', ') terom@97: terom@97: # additional count? terom@97: if 'count' in groups and groups['count'] : terom@97: list.append('+%d' % int(groups['count'])) terom@97: terom@97: # join terom@97: data = ' '.join(list) terom@97: terom@86: else : terom@86: data = None terom@97: terom@97: # custom types? 
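
    # Illustrative samples of the line formats the expressions above are written to match.
    # These are constructed here for documentation and are not taken from any real log:
    #
    #   --- Log opened Wed Mar 04 00:00:06 2009
    #   20:14 <@nick> a normal channel message
    #   20:15  * nick does something
    #   20:16 -!- nick [user@example.host] has joined #channel
    #   --- Day changed Thu Mar 05 2009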
    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, returning either None (ignore the line), or a (date, LogLine) tuple, where
            the LogLine may itself be None for lines that only update the current date (DAY_CHANGED).

            Uses self.TYPE_REGEXES to do the matching.
        """

        # empty line
        if not line :
            return

        # look for match
        match = type = None

        # test each type
        for type, regex in self.TYPE_REGEXES :
            # attempt to match
            match = regex.match(line)

            # found, break
            if match :
                break

        # no match found?
        if not match :
            raise LogParseError(line, offset, "Line did not match any type")

        # match groups
        groups = match.groupdict(None)

        # parse timestamp
        if 'datetime' in groups :
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override date?
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        elif 'date' in groups :
            # parse date-only datetime
            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        else :
            # no timestamp !?
            raise LogParseError(line, offset, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # channel, currently unused
        channel_name = (groups.get('channel') or groups.get('channel2'))

        # source
        if 'server1' in groups :
            source = (None, None, groups.get('server1'), None)

        else :
            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))

        # target
        if 'server2' in groups :
            target = groups.get('server2')

        else :
            target = groups.get('target')

        # data
        if 'message' in groups :
            data = groups['message']

        elif 'mode' in groups :
            data = groups['mode']

        elif 'topic' in groups :
            data = groups['topic']

        elif 'nick_list' in groups :
            # split into components
            nick_list = groups['nick_list'].split(', ')

            # additional count?
            if 'count' in groups and groups['count'] :
                nick_list.append('+%d' % int(groups['count']))

            # join
            data = ' '.join(nick_list)

        else :
            data = None

        # custom types?
        if type == 'DAY_CHANGED' :
            # propagate the new date, with no LogLine
            return dtz, None

        else :
            # build+return (date, LogLine)
            return date, LogLine(channel, offset, type, dtz, source, target, data)

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given lines, yielding LogLines.
        """

        for offset, line in enumerate(lines) :
            # offset?
            if starting_offset :
                offset = starting_offset + offset

            else :
                offset = None

            # try and parse
            try :
                # get None or (date, line)
                line_info = self.parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError :
                raise

            # wrap other errors as LogParseError
            except Exception, e :
                raise LogParseError(line, offset, "Parsing line failed: %s" % e)

            else :
                # nothing?
                if not line_info :
                    continue

                # unpack, update date
                date, line = line_info

                # only yield actual LogLines; DAY_CHANGED just updates the date
                if line :
                    yield line
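
# Example usage (an illustrative sketch, not part of the module proper). It assumes pytz
# (implied by tz.localize() above) and a LogChannel instance obtained elsewhere in this
# codebase; 'channel', 'channel.log' and the timezone below are placeholder values:
#
#   import pytz
#
#   parser = IrssiParser(pytz.timezone('Europe/Helsinki'))
#   lines = (l.rstrip('\r\n').decode('utf-8') for l in open('channel.log'))
#
#   for line in parser.parse_lines(channel, lines, date=datetime.date(2009, 3, 4), starting_offset=1) :
#       print line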