terom@50: """
terom@50:     Parse log data into log_events
terom@50: """
terom@50: 
terom@86: import re
terom@50: import datetime
terom@50: 
terom@86: from log_line import LogTypes, LogLine
terom@86: 
terom@86: class LogParseError (Exception) :
terom@86:     """
terom@86:         Parsing some line failed
terom@86:     """
terom@86: 
terom@97:     def __init__ (self, line, offset, message) :
terom@86:         super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))
terom@50: 
terom@50: class LogParser (object) :
terom@50:     """
terom@50:         Abstract interface
terom@50:     """
terom@50: 
terom@50:     def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
terom@50:         """
terom@50:             Setup the parser to use the given format for line timestamps, which are of the given timezone
terom@50:         """
terom@50: 
terom@50:         self.tz = tz
terom@50:         self.timestamp_fmt = timestamp_fmt
terom@50: 
terom@86:     def parse_lines (self, channel, lines, date=None, starting_offset=None) :
terom@50:         """
terom@50:             Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.
terom@64: 
terom@86:             Channel is the LogChannel that these lines belong to.
terom@86: 
terom@64:             Offset is the starting offset, and may be None to not use it.
terom@50:             
terom@50:             Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
terom@50:             information, event timestamps will have a date component of 1900/1/1.
terom@50:         """
terom@50: 
terom@50:         abstract
terom@50: 
terom@50: class IrssiParser (LogParser) :
terom@50:     """
terom@50:         A parser for irssi logfiles
terom@50:     """
terom@110:     
terom@110:     # timestamp prefix, with trailing space
terom@110:     _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'
terom@50: 
terom@86:     # subexpression parts
terom@86:     _NICK = r'(?P<nickname>.+?)'
terom@86:     _NICK2 = r'(?P<nickname2>.+?)'
terom@92:     _TARGET = r'(?P<target>.+?)'
terom@86:     _CHAN = r'(?P<channel>.+?)'
terom@92:     _CHAN2 = r'(?P<channel2>.+?)'
terom@86:     _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
terom@86:     _MSG = r'(?P<message>.*)'
terom@97:     _SRV1 = r'(?P<server1>.+?)'
terom@97:     _SRV2 = r'(?P<server2>.+?)'
terom@86: 
terom@86:     # regular expressions for matching lines, by type
terom@86:     TYPE_EXPRS = (
terom@86:         (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
terom@86:         (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
terom@110:         (   LogTypes.MSG,           _TS + r'<(?P<flags>.)' + _NICK + '> ' + _MSG                   ),
terom@110:         (   LogTypes.NOTICE,        _TS + r'-' + _NICK + ':' + _CHAN + '- ' + _MSG                 ),
terom@110:         (   LogTypes.ACTION,        _TS + r'\* ' + _NICK + ' ' + _MSG                             ),
terom@110:         (   LogTypes.JOIN,          _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has joined ' + _CHAN                               ), 
terom@110:         (   LogTypes.PART,          _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has left ' + _CHAN + ' \[(?P<message>.*?)\]'       ),
terom@110:         (   LogTypes.KICK,          _TS + r'-!- ' + _TARGET + ' was kicked from ' + _CHAN + ' by ' + _NICK + ' \[(?P<message>.*?)\]'   ),
terom@103:         # XXX: use hostname instead of nickname for ServerMode
terom@110:         (   LogTypes.MODE,          _TS + r'-!- (mode|ServerMode)/' + _CHAN + ' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                ),
terom@110:         (   LogTypes.NICK,          _TS + r'-!- ' + _NICK + ' is now known as (?P<target>\S+)'                                         ),
terom@110:         (   LogTypes.QUIT,          _TS + r'-!- ' + _NICK + ' \[' + _USERHOST + '\] has quit \[(?P<message>.*?)\]'                     ),
terom@110:         (   LogTypes.TOPIC,         _TS + r'-!- (' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + ' on ' + _CHAN2 + ')'    ),
terom@86: 
terom@110:         (   LogTypes.SELF_NOTICE,   _TS + r'\[notice\(' + _CHAN + '\)\] ' + _MSG                   ),
terom@110:         (   LogTypes.SELF_NICK,     _TS + r'-!- You\'re now known as (?P<target>\S+)'              ),
terom@97: 
terom@110:         (   LogTypes.NETSPLIT_START,    _TS + r'-!- Netsplit ' + _SRV1 + ' <-> ' + _SRV2 + ' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
terom@110:         (   LogTypes.NETSPLIT_END,      _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'              ),
terom@97: 
terom@97:         (   'DAY_CHANGED',          r'--- Day changed (?P<date>.+)'                                 ),
terom@86:     )
terom@86: 
terom@86:     # precompile
terom@86:     TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS]
terom@86: 
terom@86:     def parse_line (self, channel, line, date, offset=None) :
terom@86:         """
terom@86:             Parse a single line, and return the resulting LogLine, or None, to ignore the line.
terom@86: 
terom@86:             Uses self.TYPE_REGEXES to do the matching
terom@86:         """
terom@86: 
terom@86:         # empty line
terom@86:         if not line :
terom@86:             return
terom@86: 
terom@86:         # look for match
terom@86:         match = type = None
terom@86: 
terom@86:         # test each type
terom@86:         for type, regex in self.TYPE_REGEXES :
terom@86:             # attempt to match
terom@86:             match = regex.match(line)
terom@86:             
terom@86:             # found, break
terom@86:             if match :
terom@86:                 break
terom@86:         
terom@86:         # no match found?
terom@86:         if not match :
terom@97:             raise LogParseError(line, offset, "Line did not match any type")
terom@86:         
terom@86:         # match groups
terom@86:         groups = match.groupdict(None)
terom@86: 
terom@86:         # parse timestamp
terom@86:         if 'datetime' in groups :
terom@86:             # parse datetime using default asctime() format
terom@86:             dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')
terom@86: 
terom@86:         elif 'timestamp' in groups :
terom@86:             # parse timestamp into naive datetime
terom@86:             dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)
terom@86:             
terom@86:             # override date?
terom@86:             if date :
terom@86:                 dt = dt.replace(year=date.year, month=date.month, day=date.day)
terom@86: 
terom@97:         elif 'date' in groups :
terom@97:             # parse date-only datetime
terom@97:             dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')
terom@97: 
terom@86:         else :
terom@86:             # no timestamp !?
terom@97:             raise LogParseError(line, offset, "No timestamp")
terom@86: 
terom@86:         # now localize with timezone
terom@86:         dtz = self.tz.localize(dt)
terom@86: 
terom@92:         # channel, currently unused
terom@92:         channel_name = (groups.get('channel') or groups.get('channel2'))
terom@92: 
terom@86:         # source
terom@97:         if 'server1' in groups :
terom@97:             source = (None, None, groups.get('server1'), None)
terom@97: 
terom@97:         else :
terom@97:             source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))
terom@86: 
terom@86:         # target
terom@97:         if 'server2' in groups :
terom@97:             target = groups.get('server2')
terom@97: 
terom@97:         else :
terom@97:             target = groups.get('target')
terom@86: 
terom@86:         # data
terom@86:         if 'message' in groups :
terom@86:             data = groups['message']
terom@86:         
terom@86:         elif 'mode' in groups :
terom@86:             data = groups['mode']
terom@86: 
terom@86:         elif 'topic' in groups :
terom@86:             data = groups['topic']
terom@86:         
terom@97:         elif 'nick_list' in groups :
terom@97:             # split into components
terom@97:             list = groups['nick_list'].split(', ')
terom@97:             
terom@97:             # additional count?
terom@97:             if 'count' in groups and groups['count'] :
terom@97:                 list.append('+%d' % int(groups['count']))
terom@97:             
terom@97:             # join
terom@97:             data = ' '.join(list)
terom@97: 
terom@86:         else :
terom@86:             data = None
terom@97:         
terom@97:         # custom types?
terom@97:         if type == 'DAY_CHANGED' :
terom@97:             # new date
terom@97:             date = dtz
terom@109:         
terom@109:         else :
terom@109:             # build+return (date, LogLine)
terom@109:             return date, LogLine(channel, offset, type, dtz, source, target, data)
terom@86: 
terom@86:     def parse_lines (self, channel, lines, date=None, starting_offset=None) :
terom@50:         """
terom@50:             Parse the given lines, yielding LogEvents. 
terom@50:         """
terom@65: 
terom@64:         for offset, line in enumerate(lines) :
terom@83:             # offset?
terom@83:             if starting_offset :
terom@83:                 offset = starting_offset + offset
terom@83: 
terom@50:             else :
terom@83:                 offset = None
terom@83:             
terom@83:             # try and parse
terom@83:             try :
terom@109:                 # get None or (date, line)
terom@109:                 line_info = self.parse_line(channel, line, date, offset)
terom@109: 
terom@109:            # passthrough LogParseError's
terom@86:             except LogParseError :
terom@86:                 raise
terom@86:             
terom@86:             # wrap other errors as LogParseError
terom@83:             except Exception, e :
terom@86:                 raise LogParseError(line, offset, "Parsing line failed: %s" % e)
terom@83:             
terom@83:             else :
terom@109:                 # nothing?
terom@109:                 if not line_info :
terom@109:                     continue
terom@109:                 
terom@109:                 # unpack, update date
terom@109:                 date, line = line_info
terom@109:                 
terom@109:                 # yield
terom@109:                 yield line
terom@64: 
terom@64: