diff -r 9c7769850195 -r 6db2527b67cf log_parser.py
--- a/log_parser.py	Sun Sep 13 00:49:55 2009 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,233 +0,0 @@
-"""
-    Parse log data into log_events
-"""
-
-import re
-import datetime
-
-from log_line import LogTypes, LogLine
-
-class LogParseError (Exception) :
-    """
-        Parsing some line failed
-    """
-
-    def __init__ (self, line, offset, message) :
-        super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))
-
-class LogParser (object) :
-    """
-        Abstract interface
-    """
-
-    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
-        """
-            Setup the parser to use the given format for line timestamps, which are of the given timezone
-        """
-
-        self.tz = tz
-        self.timestamp_fmt = timestamp_fmt
-
-    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
-        """
-            Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.
-
-            Channel is the LogChannel that these lines belong to.
-
-            Offset is the starting offset, and may be None to not use it.
-
-            Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
-            information, event timestamps will have a date component of 1900/1/1.
-        """
-
-        abstract
-
-class IrssiParser (LogParser) :
-    """
-        A parser for irssi logfiles
-    """
-
-    # timestamp prefix, with trailing space
-    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'
-
-    # subexpression parts
-    _NICK = r'(?P<nickname>.+?)'
-    _NICK2 = r'(?P<nickname2>.+?)'
-    _TARGET = r'(?P<target>.+?)'
-    _CHAN = r'(?P<channel>.+?)'
-    _CHAN2 = r'(?P<channel2>.+?)'
-    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
-    _MSG = r'(?P<message>.*)'
-    _SRV1 = r'(?P<server1>.+?)'
-    _SRV2 = r'(?P<server2>.+?)'
-
-    # regular expressions for matching lines, by type
-    TYPE_EXPRS = (
-        ( LogTypes.LOG_OPEN,        r'--- Log opened (?P<datetime>.+)' ),
-        ( LogTypes.LOG_CLOSE,       r'--- Log closed (?P<datetime>.+)' ),
-        ( LogTypes.MSG,             _TS + r'<(?P<flags>.)' + _NICK + '> ' + _MSG ),
-        ( LogTypes.NOTICE,          _TS + r'-' + _NICK + ':' + _CHAN + '- ' + _MSG ),
-        ( LogTypes.ACTION,          _TS + r'\* ' + _NICK + ' ' + _MSG ),
-        ( LogTypes.JOIN,            _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has joined ' + _CHAN ),
-        ( LogTypes.PART,            _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has left ' + _CHAN + ' \[(?P<message>.*?)\]' ),
-        ( LogTypes.KICK,            _TS + r'-!- ' + _TARGET + ' was kicked from ' + _CHAN + ' by ' + _NICK + ' \[(?P<message>.*?)\]' ),
-        # XXX: use hostname instead of nickname for ServerMode
-        ( LogTypes.MODE,            _TS + r'-!- (mode|ServerMode)/' + _CHAN + ' \[(?P<mode>.+?)\] by (?P<nickname>\S+)' ),
-        ( LogTypes.NICK,            _TS + r'-!- ' + _NICK + ' is now known as (?P<target>\S+)' ),
-        ( LogTypes.QUIT,            _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has quit \[(?P<message>.*?)\]' ),
-        ( LogTypes.TOPIC,           _TS + r'-!- (' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + ' on ' + _CHAN2 + ')' ),
-
-        ( LogTypes.SELF_NOTICE,     _TS + r'\[notice\(' + _CHAN + '\)\] ' + _MSG ),
-        ( LogTypes.SELF_NICK,       _TS + r'-!- You\'re now known as (?P<target>\S+)' ),
-
-        ( LogTypes.NETSPLIT_START,  _TS + r'-!- Netsplit ' + _SRV1 + ' <-> ' + _SRV2 + ' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?' ),
-        ( LogTypes.NETSPLIT_END,    _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?' ),
-
-        ( 'DAY_CHANGED',            r'--- Day changed (?P<date>.+)' ),
-    )
-
-    # precompile
-    TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS]
-
-    def parse_line (self, channel, line, date, offset=None) :
-        """
-            Parse a single line, and return the resulting LogLine, or None, to ignore the line.
-
-            Uses self.TYPE_REGEXES to do the matching
-        """
-
-        # empty line
-        if not line :
-            return
-
-        # look for match
-        match = type = None
-
-        # test each type
-        for type, regex in self.TYPE_REGEXES :
-            # attempt to match
-            match = regex.match(line)
-
-            # found, break
-            if match :
-                break
-
-        # no match found?
-        if not match :
-            raise LogParseError(line, offset, "Line did not match any type")
-
-        # match groups
-        groups = match.groupdict(None)
-
-        # parse timestamp
-        if 'datetime' in groups :
-            # parse datetime using default asctime() format
-            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')
-
-        elif 'timestamp' in groups :
-            # parse timestamp into naive datetime
-            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
-
-            # override date?
-            if date :
-                dt = dt.replace(year=date.year, month=date.month, day=date.day)
-
-        elif 'date' in groups :
-            # parse date-only datetime
-            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')
-
-        else :
-            # no timestamp !?
-            raise LogParseError(line, offset, "No timestamp")
-
-        # now localize with timezone
-        dtz = self.tz.localize(dt)
-
-        # channel, currently unused
-        channel_name = (groups.get('channel') or groups.get('channel2'))
-
-        # source
-        if 'server1' in groups :
-            source = (None, None, groups.get('server1'), None)
-
-        else :
-            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))
-
-        # target
-        if 'server2' in groups :
-            target = groups.get('server2')
-
-        else :
-            target = groups.get('target')
-
-        # data
-        if 'message' in groups :
-            data = groups['message']
-
-        elif 'mode' in groups :
-            data = groups['mode']
-
-        elif 'topic' in groups :
-            data = groups['topic']
-
-        elif 'nick_list' in groups :
-            # split into components
-            list = groups['nick_list'].split(', ')
-
-            # additional count?
-            if 'count' in groups and groups['count'] :
-                list.append('+%d' % int(groups['count']))
-
-            # join
-            data = ' '.join(list)
-
-        else :
-            data = None
-
-        # custom types?
-        if type == 'DAY_CHANGED' :
-            # new date
-            date = dtz
-
-        else :
-            # build+return (date, LogLine)
-            return date, LogLine(channel, offset, type, dtz, source, target, data)
-
-    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
-        """
-            Parse the given lines, yielding LogEvents.
-        """
-
-        for offset, line in enumerate(lines) :
-            # offset?
-            if starting_offset :
-                offset = starting_offset + offset
-
-            else :
-                offset = None
-
-            # try and parse
-            try :
-                # get None or (date, line)
-                line_info = self.parse_line(channel, line, date, offset)
-
-            # passthrough LogParseError's
-            except LogParseError :
-                raise
-
-            # wrap other errors as LogParseError
-            except Exception, e :
-                raise LogParseError(line, offset, "Parsing line failed: %s" % e)
-
-            else :
-                # nothing?
-                if not line_info :
-                    continue
-
-                # unpack, update date
-                date, line = line_info
-
-                # yield
-                yield line
-
-
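For reference, a minimal sketch of how the IrssiParser removed above was driven, reconstructed from the deleted module's own docstrings and signatures rather than taken from the repository. The pytz timezone (suggested by the tz.localize() call in parse_line), the sample log lines, and the None placeholder for the LogChannel argument are assumptions for illustration only.

    # sketch: feed irssi-formatted lines through the parser deleted by this changeset
    import datetime
    import pytz

    from log_parser import IrssiParser      # the module removed above

    # tz.localize() in parse_line() implies a pytz-style timezone object;
    # timestamp_fmt must match the per-line timestamps in the log
    parser = IrssiParser(tz=pytz.timezone('Europe/Helsinki'), timestamp_fmt='%H:%M')

    lines = [
        u'12:34 <@spam> hello world',
        u'12:35 -!- spam [user@example.net] has quit [leaving]',
    ]

    # channel is only forwarded into the resulting LogLine, so None works for a demo;
    # date supplies the year/month/day that time-only timestamps lack
    for log_line in parser.parse_lines(None, lines, date=datetime.date(2009, 9, 13),
                                       starting_offset=1):
        print log_line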