log_parser.py
changeset 86 645cf9c4441e
parent 83 a34e9f56ddda
child 92 74f6a0b01ddf
equal deleted inserted replaced
85:0521cf830eb9 86:645cf9c4441e
     1 """
     1 """
     2     Parse log data into log_events
     2     Parse log data into log_events
     3 """
     3 """
     4 
     4 
       
     5 import re
     5 import datetime
     6 import datetime
     6 
     7 
     7 import log_line
     8 from log_line import LogTypes, LogLine
     8 from log_line import LogTypes
     9 
       
    10 class LogParseError (Exception) :
       
    11     """
       
    12         Parsing some line failed
       
    13     """
       
    14 
       
    15     def __init__ (self, offset, line, message) :
       
    16         super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))
     9 
    17 
    10 class LogParser (object) :
    18 class LogParser (object) :
    11     """
    19     """
    12         Abstract interface
    20         Abstract interface
    13     """
    21     """
    18         """
    26         """
    19 
    27 
    20         self.tz = tz
    28         self.tz = tz
    21         self.timestamp_fmt = timestamp_fmt
    29         self.timestamp_fmt = timestamp_fmt
    22 
    30 
    23     def parse_lines (self, lines, date=None, starting_offset=None) :
    31     def parse_lines (self, channel, lines, date=None, starting_offset=None) :
    24         """
    32         """
    25             Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.
    33             Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.
       
    34 
       
    35             Channel is the LogChannel that these lines belong to.
    26 
    36 
    27             Offset is the starting offset, and may be None to not use it.
    37             Offset is the starting offset, and may be None to not use it.
    28             
    38             
    29             Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
    39             Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
    30             information, event timestamps will have a date component of 1900/1/1.
    40             information, event timestamps will have a date component of 1900/1/1.
    31         """
    41         """
    32 
    42 
    33         abstract
    43         abstract
    34 
    44 
    35 
       
    36 class IrssiParser (LogParser) :
    45 class IrssiParser (LogParser) :
    37     """
    46     """
    38         A parser for irssi logfiles
    47         A parser for irssi logfiles
    39     """
    48     """
    40 
    49 
    41     def parse_lines (self, lines, date=None, starting_offset=None) :
    # subexpression parts, shared by the per-type expressions below
    _TS = r'(?P<timestamp>\S+)'
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'

    # end-of-line anchor for expressions whose last group is non-greedy. The expressions are applied with
    # re.match(), which does not require the whole line to match, so without this a trailing non-greedy group
    # stops as early as it can: a one-character channel, or a message cut short at the first ']'
    _END = r'\s*$'

    # regular expressions for matching lines, by type.
    # All parts are raw strings, so that the regex escapes (\[, \*, \S, ...) are never read as string escapes.
    TYPE_EXPRS = (
        (   LogTypes.LOG_OPEN,      r'--- Log opened (?P<datetime>.+)'                              ),
        (   LogTypes.LOG_CLOSE,     r'--- Log closed (?P<datetime>.+)'                              ),
        (   LogTypes.MSG,           _TS + r' <(?P<flags>.)' + _NICK + r'> ' + _MSG                  ),
        (   LogTypes.NOTICE,        _TS + r' -' + _NICK + r':' + _CHAN + r'- ' + _MSG               ),
        (   LogTypes.ACTION,        _TS + r'  \* ' + _NICK + r' ' + _MSG                            ),
        (   LogTypes.JOIN,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN + _END                              ),
        (   LogTypes.PART,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]' + _END     ),
        (   LogTypes.KICK,          _TS + r' -!- ' + _NICK2 + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]' + _END  ),
        (   LogTypes.MODE,          _TS + r' -!- mode/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'                                    ),
        (   LogTypes.NICK,          _TS + r' -!- ' + _NICK + r' is now known as (?P<nickname2>\S+)'                                             ),
        (   LogTypes.QUIT,          _TS + r' -!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]' + _END                    ),
        (   LogTypes.TOPIC,         _TS + r' -!- ' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)'                          ),

        (   LogTypes.SELF_NOTICE,   _TS + r' \[notice\(' + _CHAN + r'\)\] ' + _MSG                  ),
        (   LogTypes.SELF_NICK,     _TS + r" -!- You're now known as (?P<nickname2>\S+)"            ),
    )

    # precompile, keeping the order of TYPE_EXPRS.
    # The loop variables deliberately avoid the name `type`, which would shadow the builtin
    TYPE_REGEXES = [(log_type, re.compile(expr)) for log_type, expr in TYPE_EXPRS]
       
    79 
       
    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, and return the resulting LogLine, or None, to ignore the line.

            Uses self.TYPE_REGEXES to do the matching; the expressions are tried in order, and the first one that
            matches decides the type of the line.

                channel     - passed through to the resulting LogLine as-is
                line        - the line to parse, empty lines are ignored
                date        - if given, its year/month/day replace the date part of timestamps that were parsed
                              using self.timestamp_fmt. A `datetime` group is parsed in full and keeps its own date
                offset      - passed through to the resulting LogLine, and used in error messages

            Raises LogParseError if the line does not match any type, or if the matching expression has neither a
            `datetime` nor a `timestamp` group. Errors from datetime.strptime itself are not caught here.
        """

        # empty line
        if not line :
            return None

        # test each type in turn, stopping at the first expression that matches
        for line_type, regex in self.TYPE_REGEXES :
            match = regex.match(line)

            if match :
                break

        else :
            # ran out of expressions without finding a match
            raise LogParseError(offset, line, "Line did not match any type")

        # named groups of the matching expression; which keys exist depends on that expression
        groups = match.groupdict(None)

        # parse timestamp
        if 'datetime' in groups :
            # parse datetime using default asctime() format
            dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # parse timestamp into naive datetime
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override date?
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

        else :
            # no timestamp !?
            raise LogParseError(offset, line, "No timestamp")

        # now localize with timezone
        dtz = self.tz.localize(dt)

        # source, as (nickname, username, hostname, flags); parts the expression does not define are None
        source = (groups.get('nickname'), groups.get('username'), groups.get('hostname'), groups.get('flags'))

        # target
        target = groups.get('nickname2')

        # data: the first of message/mode/topic that the expression defines, if any
        for key in ('message', 'mode', 'topic') :
            if key in groups :
                data = groups[key]
                break

        else :
            data = None

        # build+return LogLine
        return LogLine(channel, offset, line_type, dtz, source, target, data)
       
   151 
       
   152     def parse_lines (self, channel, lines, date=None, starting_offset=None) :
    42         """
   153         """
    43             Parse the given lines, yielding LogEvents. 
   154             Parse the given lines, yielding LogEvents. 
    44         """
   155         """
    45 
   156 
    46         for offset, line in enumerate(lines) :
   157         for offset, line in enumerate(lines) :
    51             else :
   162             else :
    52                 offset = None
   163                 offset = None
    53             
   164             
    54             # try and parse
   165             # try and parse
    55             try :
   166             try :
    56                 line = self.parse_line(line, date, offset)
   167                 line = self.parse_line(channel, line, date, offset)
    57 
   168             
       
   169             # passthrough LogParseError's
       
   170             except LogParseError :
       
   171                 raise
       
   172             
       
   173             # wrap other errors as LogParseError
    58             except Exception, e :
   174             except Exception, e :
    59                 raise Exception("Parsing line failed: %r@%d: %s" % (line, offset, e))
   175                 raise LogParseError(line, offset, "Parsing line failed: %s" % e)
    60             
   176             
    61             else :
   177             else :
    62                 # yield unless None
   178                 # yield unless None
    63                 if line :
   179                 if line :
    64                     yield line
   180                     yield line
    65 
   181 
    def parse_line (self, line, date, offset=None) :
        """
            Turn one line of the log into a raw LogLine.

            Returns None for lines that are to be ignored: empty lines, and '---' status lines.
            If date is given, its year/month/day replace the date part of the parsed timestamp.
        """

        # nothing to do for empty lines; status lines are not handled yet (XXX)
        if not line or line.startswith('---') :
            return

        # XXX: only the timestamp is parsed for now, the remainder of the line is kept as-is
        stamp, payload = line.split(' ', 1)

        # naive datetime out of the timestamp, with the date part overridden if we were given one
        when = datetime.datetime.strptime(stamp, self.timestamp_fmt)

        if date :
            when = when.replace(year=date.year, month=date.month, day=date.day)

        # attach the timezone
        localized = self.tz.localize(when)

        # raw event
        return log_line.LogLine(offset, LogTypes.RAW, localized, None, payload)
       
    97