--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qmsk/irclogs/log_parser.py Sun Sep 13 01:15:56 2009 +0300
@@ -0,0 +1,233 @@
+"""
+ Parse log data into log_events
+"""
+
+import re
+import datetime
+
+from log_line import LogTypes, LogLine
+
+class LogParseError (Exception) :
+    """
+        Raised when a line of log data could not be parsed.
+
+        The exception message has the form "<repr of line>@<offset>: <message>".
+    """
+
+    def __init__ (self, line, offset, message) :
+        """
+            Build the exception message from the offending line, its offset (may be None) and a description
+            of what went wrong.
+        """
+
+        # where it failed (line repr + offset), followed by why
+        text = "%r@%s: %s" % (line, offset, message)
+
+        Exception.__init__(self, text)
+
+class LogParser (object) :
+    """
+        Abstract interface for log parsers.
+
+        Subclasses must override parse_lines().
+    """
+
+    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
+        """
+            Setup the parser to use the given format for line timestamps, which are of the given timezone.
+
+            tz              - timezone that the line timestamps are in
+            timestamp_fmt   - strptime-style format of the per-line timestamp
+        """
+
+        self.tz = tz
+        self.timestamp_fmt = timestamp_fmt
+
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
+        """
+            Parse the given (iterable) lines of unicode text, without trailing newlines, into log events.
+
+            channel         - the LogChannel that these lines belong to
+            lines           - iterable of lines to parse
+            date            - lets the parser build full timestamps; otherwise, unless line timestamps have
+                              full date information, event timestamps will have a date component of 1900/1/1
+            starting_offset - offset of the first line, or None to not track offsets
+
+            This base implementation always raises NotImplementedError.
+        """
+
+        # explicit "abstract method" signal, instead of a NameError from an undefined name
+        raise NotImplementedError("%s must implement parse_lines()" % self.__class__.__name__)
+
+class IrssiParser (LogParser) :
+    """
+        A parser for irssi logfiles.
+
+        Each line is matched against TYPE_REGEXES in order; the first matching expression decides the line's
+        type, and its named groups are mapped onto the LogLine fields (timestamp, source, target, data).
+    """
+
+    # timestamp prefix, with trailing space
+    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'
+
+    # subexpression parts
+    _NICK = r'(?P<nickname>.+?)'
+    _NICK2 = r'(?P<nickname2>.+?)'
+    _TARGET = r'(?P<target>.+?)'
+    _CHAN = r'(?P<channel>.+?)'
+    _CHAN2 = r'(?P<channel2>.+?)'
+    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
+    _MSG = r'(?P<message>.*)'
+    _SRV1 = r'(?P<server1>.+?)'
+    _SRV2 = r'(?P<server2>.+?)'
+
+    # regular expressions for matching lines, by type; order matters, the first match wins
+    # NOTE: the channel/channel2 groups are matched, but their values are not used by parse_line
+    TYPE_EXPRS = (
+        ( LogTypes.LOG_OPEN,        r'--- Log opened (?P<datetime>.+)' ),
+        ( LogTypes.LOG_CLOSE,       r'--- Log closed (?P<datetime>.+)' ),
+        ( LogTypes.MSG,             _TS + r'<(?P<flags>.)' + _NICK + r'> ' + _MSG ),
+        ( LogTypes.NOTICE,          _TS + r'-' + _NICK + r':' + _CHAN + r'- ' + _MSG ),
+        ( LogTypes.ACTION,          _TS + r'\* ' + _NICK + r' ' + _MSG ),
+        ( LogTypes.JOIN,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN ),
+        ( LogTypes.PART,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]' ),
+        ( LogTypes.KICK,            _TS + r'-!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]' ),
+        # XXX: use hostname instead of nickname for ServerMode
+        ( LogTypes.MODE,            _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)' ),
+        ( LogTypes.NICK,            _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)' ),
+        ( LogTypes.QUIT,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]' ),
+        ( LogTypes.TOPIC,           _TS + r'-!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')' ),
+
+        ( LogTypes.SELF_NOTICE,     _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG ),
+        ( LogTypes.SELF_NICK,       _TS + r'-!- You\'re now known as (?P<target>\S+)' ),
+
+        ( LogTypes.NETSPLIT_START,  _TS + r'-!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?' ),
+        ( LogTypes.NETSPLIT_END,    _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?' ),
+
+        # internal pseudo-type: updates the current date, and never produces a LogLine
+        ( 'DAY_CHANGED',            r'--- Day changed (?P<date>.+)' ),
+    )
+
+    # precompile
+    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]
+
+    def parse_line (self, channel, line, date, offset=None) :
+        """
+            Parse a single line, using self.TYPE_REGEXES to do the matching.
+
+            channel     - passed through to the LogLine
+            line        - the line of text, without trailing newline
+            date        - if given, its year/month/day replace the date part of time-only line timestamps
+            offset      - passed through to the LogLine, and used in error messages
+
+            Returns None for an empty line, which should be ignored. Otherwise returns a (date, log_line)
+            tuple, where date is the date to use for the following lines. For a "Day changed" line, date is
+            the new (localized) date and log_line is None.
+
+            Raises LogParseError if the line does not match any known type, or has no timestamp.
+        """
+
+        # empty line
+        if not line :
+            return None
+
+        # test each type in turn, the first match wins
+        for line_type, regex in self.TYPE_REGEXES :
+            match = regex.match(line)
+
+            if match :
+                break
+
+        else :
+            # ran out of types without finding a match
+            raise LogParseError(line, offset, "Line did not match any type")
+
+        # named groups of the matched expression; groups that did not participate are None
+        groups = match.groupdict(None)
+
+        # parse timestamp
+        if 'datetime' in groups :
+            # parse datetime using default asctime() format
+            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')
+
+        elif 'timestamp' in groups :
+            # parse timestamp into naive datetime
+            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
+
+            # override date?
+            if date is not None :
+                dt = dt.replace(year=date.year, month=date.month, day=date.day)
+
+        elif 'date' in groups :
+            # parse date-only datetime
+            dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')
+
+        else :
+            # no timestamp !?
+            raise LogParseError(line, offset, "No timestamp")
+
+        # now localize with timezone; self.tz must provide localize() (presumably a pytz timezone)
+        dtz = self.tz.localize(dt)
+
+        # custom types?
+        if line_type == 'DAY_CHANGED' :
+            # hand the new date back to the caller, there is no LogLine for this
+            return dtz, None
+
+        # source: (nickname, username, hostname, flags), with the server taking the hostname slot for netsplits
+        if 'server1' in groups :
+            source = (None, None, groups.get('server1'), None)
+
+        else :
+            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))
+
+        # target
+        if 'server2' in groups :
+            target = groups.get('server2')
+
+        else :
+            target = groups.get('target')
+
+        # data
+        if 'message' in groups :
+            data = groups['message']
+
+        elif 'mode' in groups :
+            data = groups['mode']
+
+        elif 'topic' in groups :
+            # None for the "Topic unset" form
+            data = groups['topic']
+
+        elif 'nick_list' in groups :
+            # split into components
+            nicks = groups['nick_list'].split(', ')
+
+            # additional count of nicknames that were not listed?
+            if groups.get('count') :
+                nicks.append('+%d' % int(groups['count']))
+
+            # join
+            data = ' '.join(nicks)
+
+        else :
+            data = None
+
+        # build+return (date, LogLine)
+        return date, LogLine(channel, offset, line_type, dtz, source, target, data)
+
+    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
+        """
+            Parse the given lines, yielding LogLines.
+
+            channel         - passed through to each LogLine
+            lines           - iterable of lines of text, without trailing newlines
+            date            - initial date for line timestamps; updated by "Day changed" lines
+            starting_offset - offset of the first line (zero is valid), or None to not track offsets
+
+            Empty lines and "Day changed" lines do not yield anything.
+
+            Raises LogParseError if a line fails to parse; other errors from parsing are wrapped in one.
+        """
+
+        for index, line in enumerate(lines) :
+            # offset?
+            if starting_offset is not None :
+                offset = starting_offset + index
+
+            else :
+                offset = None
+
+            # try and parse
+            try :
+                # get None or (date, LogLine-or-None)
+                line_info = self.parse_line(channel, line, date, offset)
+
+            # passthrough LogParseError's
+            except LogParseError :
+                raise
+
+            # wrap other errors as LogParseError
+            except Exception as e :
+                raise LogParseError(line, offset, "Parsing line failed: %s" % e)
+
+            # nothing?
+            if not line_info :
+                continue
+
+            # unpack, update date for the following lines
+            date, log_line = line_info
+
+            # day changes only update the date
+            if log_line is not None :
+                yield log_line
+
+