pvl/syslog/parser.py
changeset 224 ed410776effd
parent 223 6842794c20e8
child 225 3c2d0dd42045
equal deleted inserted replaced
223:6842794c20e8 224:ed410776effd
     1 import datetime, time
       
     2 import re
       
     3 
       
     4 import logging; log = logging.getLogger('pvl.syslog.parser')
       
     5 
       
     6 RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
       
     7 RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'
       
     8 
       
     9 def rfc3339 (timestamp) :
       
    10     """
       
    11         RFC3339 timestamps as used in some syslog implementations.
       
    12 
       
    13         Returns a datetime in some random timezone, possibly localtime.
       
    14     """
       
    15 
       
    16     match = RFC3339_RE.match(timestamp)
       
    17 
       
    18     if not match :
       
    19         return None
       
    20     
       
    21     # parts
       
    22     dt = datetime.datetime.strptime(match.group(1), RFC3339_FMT)
       
    23     tz = match.group(2)
       
    24     
       
    25     # TODO: timezone?
       
    26     return dt
       
    27 
       
    28     if not tz :
       
    29         # XXX: localtime
       
    30         return dt
       
    31 
       
    32     elif tz == 'Z' :
       
    33         # UTC
       
    34         pass
       
    35 
       
    36     elif tz[0] in '+-' :
       
    37         hours, minutes = tz[1:].split(':')
       
    38         td = datetime.timedelta(hours=int(hours), minutes=int(minutes))
       
    39         
       
    40         if tz[0] == '-' :
       
    41             dt += td
       
    42         if tz[0] == '+' :
       
    43             dt -= td
       
    44     else :
       
    45         raise ValueError("Invalid timezone offset: %s" % timestamp)
       
    46 
       
    47     # XXX: UTC
       
    48     return dt
       
    49 
       
    50 RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
       
    51 RFC3164_FMT = '%b %d %H:%M:%S'
       
    52 RFC3164_PRE = '%Y ' # add missing year, assuming current
       
    53 
       
    54 def rfc3164 (timestamp) :
       
    55     """
       
    56         Traditional BSD Syslog timestamps.
       
    57 
       
    58         Returns a datetime assumed to be in localtime.
       
    59     """
       
    60 
       
    61     if not RFC3164_RE.match(timestamp) :
       
    62         return
       
    63 
       
    64     return datetime.datetime.strptime(time.strftime(RFC3164_PRE) + timestamp, RFC3164_PRE + RFC3164_FMT)
       
    65        
       
    66 class SyslogParser (object) :
       
    67     """
       
    68         Parse syslog lines in text format, as used in logfiles/fifos.
       
    69     """
       
    70 
       
    71     SEVERITIES = dict(enumerate((
       
    72         'emerg',
       
    73         'alert', 
       
    74         'crit', 
       
    75         'err',
       
    76         'warning',
       
    77         'notice',
       
    78         'info', 
       
    79         'debug',
       
    80     )))
       
    81 
       
    82     FACILITIES = dict(enumerate((
       
    83         'kern',     # 0
       
    84         'user',     # 1
       
    85         'mail',     # 2
       
    86         'daemon',   # 3
       
    87         'auth',     # 4
       
    88         'syslog',   # 5
       
    89         'lpr',      # 6
       
    90         'news',     # 7
       
    91         'uucp',     # 8
       
    92         'cron',     # 9
       
    93         'authpriv', # 10
       
    94         'ftp',      # 11
       
    95         'ntp',      # 12
       
    96         'audit',    # 13
       
    97         'alert',    # 14
       
    98         'clock',    # 15
       
    99         'local0',   # 16
       
   100         'local1',   # 17
       
   101         'local2',   # 18
       
   102         'local3',   # 19
       
   103         'local4',   # 20
       
   104         'local5',   # 21
       
   105         'local6',   # 22
       
   106         'local7',   # 23
       
   107     )))
       
   108 
       
   109     # default syslogd format
       
   110     SYSLOG_RE = re.compile(
       
   111         # the timestamp+hostname header
       
   112         # XXX:  hostname may be missing
       
   113         #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
       
   114             r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'
       
   115         +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
       
   116         +   r'(?P<hostname>\S+)? '
       
   117 
       
   118         # the message, including possible tag/pid
       
   119         +   r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
       
   120     )
       
   121 
       
   122     def __init__ (self, raw=False, facility=None, severity=None) :
       
   123         """
       
   124             Using given facility/severity as default.
       
   125         """
       
   126 
       
   127         self.raw = raw
       
   128         self.facility = facility
       
   129         self.severity = severity
       
   130 
       
   131     def parse_pri (self, match) :
       
   132         """
       
   133             Parse pri/facility/severity.
       
   134         """
       
   135 
       
   136         pri = match.group('pri')
       
   137         facility = match.group('facility') or self.facility
       
   138         severity = match.group('severity') or self.severity
       
   139         
       
   140         if pri and pri.isdigit() :
       
   141             pri = int(pri)
       
   142             facility, severity = divmod(pri, 8)
       
   143 
       
   144         return dict(
       
   145             pri         = pri,
       
   146             severity    = self.SEVERITIES.get(severity, severity),
       
   147             facility    = self.FACILITIES.get(facility, facility)
       
   148         )
       
   149 
       
   150     def parse_timestamp (self, match) :
       
   151         """
       
   152             Parse timstamp from line into datetime.
       
   153         """
       
   154 
       
   155         timestamp = match.group('timestamp')
       
   156 
       
   157         # timestamp, in various formats
       
   158         try :
       
   159             return rfc3164(timestamp) or rfc3339(timestamp)
       
   160 
       
   161         except ValueError as ex:
       
   162             # skip it
       
   163             log.warning("timestamp: %s:", timestamp, exc_info=ex)
       
   164             return None
       
   165 
       
   166     def parse_prog (self, match) :
       
   167         """
       
   168             Parse prog from line.
       
   169         """
       
   170 
       
   171         prog = match.group('program')
       
   172 
       
   173         if prog :
       
   174             return prog
       
   175         else :
       
   176             # no tag
       
   177             return None
       
   178 
       
   179     def parse (self, line) :
       
   180         """
       
   181             Parse given input line into SyslogMessage.
       
   182         """
       
   183 
       
   184         # ignore whitespace
       
   185         line = line.strip()
       
   186 
       
   187         # timestamp?
       
   188         if self.raw :
       
   189             # from defaults
       
   190             return dict(
       
   191                 timestamp   = datetime.datetime.now(), # XXX: None?
       
   192                 host        = None,
       
   193                 prog        = None,
       
   194                 pid         = None,
       
   195                 msg         = line,
       
   196             )
       
   197 
       
   198         else :
       
   199             # parse
       
   200             match = self.SYSLOG_RE.match(line)
       
   201 
       
   202             if not match :
       
   203                 log.warn("Unparseable syslog message: %r", line)
       
   204                 return
       
   205 
       
   206             # parse
       
   207             item = dict(
       
   208                 timestamp   = self.parse_timestamp(match),
       
   209                 host        = match.group('hostname'),
       
   210                 prog        = self.parse_prog(match),
       
   211                 pid         = match.group('pid'),
       
   212                 msg         = match.group('text'),
       
   213             )
       
   214            
       
   215             # facility/severity prefix?
       
   216             item.update(self.parse_pri(match))
       
   217 
       
   218             return item
       
   219     
       
   220     def process (self, lines) :
       
   221         """
       
   222             Yield SyslogMessages from given series of lines.
       
   223         """
       
   224 
       
   225         for line in lines :
       
   226             item = self.parse(line)
       
   227 
       
   228             log.debug("%s", item)
       
   229 
       
   230             if item :
       
   231                 yield item
       
   232 
       
   233     __call__ = process
       
   234