qmsk/irclogs/log_parser.py
changeset 140 6db2527b67cf
parent 110 37e67ec434f3
equal deleted inserted replaced
139:9c7769850195 140:6db2527b67cf
       
     1 """
       
     2     Parse log data into log_events
       
     3 """
       
     4 
       
     5 import re
       
     6 import datetime
       
     7 
       
     8 from log_line import LogTypes, LogLine
       
     9 
       
    class LogParseError(Exception):
        """
            Parsing some line failed.

            The exception message is formatted as "<repr(line)>@<offset>: <message>".
            The constructor arguments are also kept as attributes so that handlers
            can inspect them without re-parsing the message text:

                line    - the line that failed to parse
                offset  - the offset that was passed in for that line (may be None)
                detail  - the human-readable reason (the `message` argument)
        """

        def __init__(self, line, offset, message):
            super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message))

            # Stored under `detail` rather than `message`: on Python 2 the base
            # exception class already has a `message` attribute, and overwriting
            # it would change what existing code reads from it.
            self.line = line
            self.offset = offset
            self.detail = message
       
    17 
       
    class LogParser(object):
        """
            Abstract interface for log parsers.

            Subclasses must implement parse_lines().
        """

        def __init__(self, tz, timestamp_fmt="%H:%M:%S"):
            """
                Setup the parser to use the given format for line timestamps, which are of the given timezone.

                tz              - timezone of the timestamps in the log; stored as self.tz
                timestamp_fmt   - strptime()-style format of per-line timestamps; stored as self.timestamp_fmt
            """

            self.tz = tz
            self.timestamp_fmt = timestamp_fmt

        def parse_lines(self, channel, lines, date=None, starting_offset=None):
            """
                Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.

                Channel is the LogChannel that these lines belong to.

                Offset is the starting offset, and may be None to not use it.

                Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
                information, event timestamps will have a date component of 1900/1/1.

                Raises NotImplementedError here; concrete parsers override this method.
            """

            # Explicitly signal "abstract" instead of relying on an undefined name
            # blowing up with a NameError.
            raise NotImplementedError("%s must implement parse_lines()" % type(self).__name__)
       
    44 
       
    class IrssiParser(LogParser):
        """
            A parser for irssi logfiles.

            Every line is matched against TYPE_REGEXES in order; the first pattern that
            matches determines the event type, and its named groups supply the
            timestamp, source, target and data of the resulting LogLine.
        """

        # timestamp prefix, with trailing space
        _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'

        # subexpression parts
        _NICK = r'(?P<nickname>.+?)'
        _NICK2 = r'(?P<nickname2>.+?)'
        _TARGET = r'(?P<target>.+?)'
        _CHAN = r'(?P<channel>.+?)'
        _CHAN2 = r'(?P<channel2>.+?)'
        _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
        _MSG = r'(?P<message>.*)'
        _SRV1 = r'(?P<server1>.+?)'
        _SRV2 = r'(?P<server2>.+?)'

        # regular expressions for matching lines, by type
        #
        # NOTE: the patterns are applied with re.match() and are not anchored at the
        # end, so a lazy group that ends a pattern (e.g. _CHAN in JOIN) only captures
        # its first character. The channel group is currently unused, so this is
        # left as-is rather than risking a change in which lines match.
        TYPE_EXPRS = (
            (LogTypes.LOG_OPEN,     r'--- Log opened (?P<datetime>.+)'),
            (LogTypes.LOG_CLOSE,    r'--- Log closed (?P<datetime>.+)'),
            (LogTypes.MSG,          _TS + r'<(?P<flags>.)' + _NICK + r'> ' + _MSG),
            (LogTypes.NOTICE,       _TS + r'-' + _NICK + r':' + _CHAN + r'- ' + _MSG),
            (LogTypes.ACTION,       _TS + r'\* ' + _NICK + r' ' + _MSG),
            (LogTypes.JOIN,         _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN),
            (LogTypes.PART,         _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]'),
            (LogTypes.KICK,         _TS + r'-!- ' + _TARGET + r' was kicked from ' + _CHAN + r' by ' + _NICK + r' \[(?P<message>.*?)\]'),
            # XXX: use hostname instead of nickname for ServerMode
            (LogTypes.MODE,         _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)'),
            (LogTypes.NICK,         _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)'),
            (LogTypes.QUIT,         _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]'),
            (LogTypes.TOPIC,        _TS + r'-!- (' + _NICK + r' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + r' on ' + _CHAN2 + r')'),

            (LogTypes.SELF_NOTICE,  _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG),
            (LogTypes.SELF_NICK,    _TS + r'-!- You\'re now known as (?P<target>\S+)'),

            (LogTypes.NETSPLIT_START,   _TS + r'-!- Netsplit ' + _SRV1 + r' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?'),
            (LogTypes.NETSPLIT_END,     _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?'),

            # pseudo-type: never turned into a LogLine, only updates the current date
            ('DAY_CHANGED',         r'--- Day changed (?P<date>.+)'),
        )

        # precompile
        TYPE_REGEXES = [(log_type, re.compile(expr)) for log_type, expr in TYPE_EXPRS]

        def _match_line(self, line, offset):
            """
                Find the first entry of self.TYPE_REGEXES that matches the line.

                Returns (type, groups), where groups maps every named group of the
                matching pattern to its value (None for groups that did not participate).

                Raises LogParseError if no pattern matches.
            """

            for log_type, regex in self.TYPE_REGEXES:
                match = regex.match(line)

                if match:
                    return log_type, match.groupdict(None)

            raise LogParseError(line, offset, "Line did not match any type")

        def _parse_timestamp(self, groups, line, date, offset):
            """
                Build the timezone-localized datetime for a matched line.

                Raises LogParseError if the match carries no timestamp group at all.
            """

            if 'datetime' in groups:
                # parse datetime using default asctime() format
                dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

            elif 'timestamp' in groups:
                # parse timestamp into naive datetime
                dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

                # override date?
                if date:
                    dt = dt.replace(year=date.year, month=date.month, day=date.day)

            elif 'date' in groups:
                # parse date-only datetime
                dt = datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

            else:
                # no timestamp !?
                raise LogParseError(line, offset, "No timestamp")

            # now localize with timezone (self.tz is presumably a pytz timezone - only .localize() is used)
            return self.tz.localize(dt)

        def _parse_line_info(self, channel, line, date, offset=None):
            """
                Parse a single line.

                Returns None for an empty line, otherwise a (date, log_line) tuple where
                date is the date to use for the following lines, and log_line is the
                resulting LogLine, or None if the line only changed the current date.
            """

            # empty line
            if not line:
                return None

            log_type, groups = self._match_line(line, offset)
            dtz = self._parse_timestamp(groups, line, date, offset)

            # custom types?
            if log_type == 'DAY_CHANGED':
                # hand the new date back to the caller; there is no event to report
                return dtz, None

            # source: (nickname, username, hostname, flags)
            if 'server1' in groups:
                source = (None, None, groups.get('server1'), None)

            else:
                source = (
                    groups.get('nickname') or groups.get('nickname2'),
                    groups.get('username'),
                    groups.get('hostname'),
                    groups.get('flags'),
                )

            # target
            if 'server2' in groups:
                target = groups.get('server2')

            else:
                target = groups.get('target')

            # data
            if 'message' in groups:
                data = groups['message']

            elif 'mode' in groups:
                data = groups['mode']

            elif 'topic' in groups:
                data = groups['topic']

            elif 'nick_list' in groups:
                # split into components
                nicks = groups['nick_list'].split(', ')

                # additional count?
                if groups.get('count'):
                    nicks.append('+%d' % int(groups['count']))

                # join
                data = ' '.join(nicks)

            else:
                data = None

            # build+return (date, LogLine)
            return date, LogLine(channel, offset, log_type, dtz, source, target, data)

        def parse_line(self, channel, line, date, offset=None):
            """
                Parse a single line, and return a (date, LogLine) tuple, or None, to ignore the line.

                None is returned for empty lines and for "Day changed" lines.

                Uses self.TYPE_REGEXES to do the matching.

                Raises LogParseError if the line matches no known type or has no timestamp.
            """

            line_info = self._parse_line_info(channel, line, date, offset)

            # keep the documented contract: date-only updates are reported as "ignore"
            if line_info is None or line_info[1] is None:
                return None

            return line_info

        def parse_lines(self, channel, lines, date=None, starting_offset=None):
            """
                Parse the given lines, yielding LogEvents.

                date is used as the date of the line timestamps, and is replaced whenever
                a "Day changed" line is seen. If starting_offset is not None, each line
                gets the offset starting_offset + <index of the line>, otherwise None.

                Raises LogParseError for any line that fails to parse; other exceptions
                raised while parsing are wrapped into a LogParseError.
            """

            for index, line in enumerate(lines):
                # offset? (0 is a valid starting offset, only None disables it)
                if starting_offset is not None:
                    offset = starting_offset + index

                else:
                    offset = None

                # try and parse
                try:
                    # get None or (date, line)
                    line_info = self._parse_line_info(channel, line, date, offset)

                # passthrough LogParseError's
                except LogParseError:
                    raise

                # wrap other errors as LogParseError
                except Exception as e:
                    raise LogParseError(line, offset, "Parsing line failed: %s" % e)

                # nothing?
                if line_info is None:
                    continue

                # unpack, update date
                date, log_line = line_info

                # date-only update, nothing to yield
                if log_line is None:
                    continue

                yield log_line
       
   232 
       
   233