pvl/syslog/parser.py
changeset 70 c8ec745a2aaa
parent 69 9da998198936
child 74 952ee07efd7a
equal deleted inserted replaced
69:9da998198936 70:c8ec745a2aaa
     1 import datetime, time
     1 import datetime, time
     2 import re
     2 import re
     3 
     3 
     4 import logging; log = logging.getLogger('pvl.syslog.parser')
     4 import logging; log = logging.getLogger('pvl.syslog.parser')
     5 
     5 
       
     6 RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
       
     7 RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'
       
     8 
       
     9 def rfc3339 (timestamp) :
       
    10     """
       
    11         RFC3339 timestamps as used in some syslog implementations.
       
    12 
       
    13         Returns a datetime in some random timezone, possibly localtime.
       
    14     """
       
    15 
       
    16     match = RFC3339_RE.match(timestamp)
       
    17 
       
    18     if not match :
       
    19         return None
       
    20     
       
    21     # parts
       
    22     dt = datetime.datetime.strptime(match.group(1), RFC3339_FMT)
       
    23     tz = match.group(2)
       
    24     
       
    25     # TODO: timezone?
       
    26     return dt
       
    27 
       
    28     if not tz :
       
    29         # XXX: localtime
       
    30         return dt
       
    31 
       
    32     elif tz == 'Z' :
       
    33         # UTC
       
    34         pass
       
    35 
       
    36     elif tz[0] in '+-' :
       
    37         hours, minutes = tz[1:].split(':')
       
    38         td = datetime.timedelta(hours=int(hours), minutes=int(minutes))
       
    39         
       
    40         if tz[0] == '-' :
       
    41             dt += td
       
    42         if tz[0] == '+' :
       
    43             dt -= td
       
    44     else :
       
    45         raise ValueError("Invalid timezone offset: %s" % timestamp)
       
    46 
       
    47     # XXX: UTC
       
    48     return dt
       
    49 
       
    50 RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
       
    51 RFC3164_FMT = '%b %d %H:%M:%S'
       
    52 RFC3164_PRE = '%Y ' # add missing year, assuming current
       
    53 
       
    54 def rfc3164 (timestamp) :
       
    55     """
       
    56         Traditional BSD Syslog timestamps.
       
    57 
       
    58         Returns a datetime assumed to be in localtime.
       
    59     """
       
    60 
       
    61     if not RFC3164_RE.match(timestamp) :
       
    62         return
       
    63 
       
    64     return datetime.datetime.strptime(time.strftime(RFC3164_PRE) + timestamp, RFC3164_PRE + RFC3164_FMT)
       
    65        
     6 class SyslogParser (object) :
    66 class SyslogParser (object) :
     7     """
    67     """
     8         Parse syslog lines in text format, as used in logfiles/fifos.
    68         Parse syslog lines in text format, as used in logfiles/fifos.
     9     """
    69     """
    10 
    70 
    43         'local4',   # 20
   103         'local4',   # 20
    44         'local5',   # 21
   104         'local5',   # 21
    45         'local6',   # 22
   105         'local6',   # 22
    46         'local7',   # 23
   106         'local7',   # 23
    47     )))
   107     )))
    48    
   108 
    49     # default syslogd format
   109     # default syslogd format
    50     SYSLOG_RE = re.compile(
   110     SYSLOG_RE = re.compile(
    51         # the timestamp+hostname header
   111         # the timestamp+hostname header
    52         # XXX:  hostname may be missing
   112         # XXX:  hostname may be missing
    53         #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
   113         #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
    54             r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'
   114             r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'
    55         +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}) (?P<hostname>\S+)? '
   115         +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
       
   116         +   r'(?P<hostname>\S+)? '
    56 
   117 
    57         # the message, including possible tag/pid
   118         # the message, including possible tag/pid
    58         +   r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
   119         +   r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
    59     )
   120     )
    60 
       
    61     TIMESTAMP_FMT = '%b %d %H:%M:%S'
       
    62 
   121 
    63     def __init__ (self, raw=False) :
   122     def __init__ (self, raw=False) :
    64         """
   123         """
    65             Using given underlying line source.
   124             Using given underlying line source.
    66         """
   125         """
    90         """
   149         """
     91             Parse timestamp from line into datetime.
    150             Parse timestamp from line into datetime.
    92         """
   151         """
    93 
   152 
    94         timestamp = match.group('timestamp')
   153         timestamp = match.group('timestamp')
    95         
   154 
    96         # add missing year; assume current
   155         # timestamp, in various formats
    97         timestamp = time.strftime('%Y') + ' ' + timestamp
   156         try :
    98         
   157             return rfc3164(timestamp) or rfc3339(timestamp)
    99         # k
   158 
   100         timestamp = datetime.datetime.strptime(timestamp, '%Y ' + self.TIMESTAMP_FMT)
   159         except ValueError as ex:
   101 
   160             # skip it
   102         return timestamp
   161             log.warning("timestamp: %s:", timestamp, exc_info=ex)
       
   162             return None
   103 
   163 
   104     def parse_prog (self, match) :
   164     def parse_prog (self, match) :
   105         """
   165         """
   106             Parse prog from line.
   166             Parse prog from line.
   107         """
   167         """
   150                 host        = match.group('hostname'),
   210                 host        = match.group('hostname'),
   151                 prog        = self.parse_prog(match),
   211                 prog        = self.parse_prog(match),
   152                 pid         = match.group('pid'),
   212                 pid         = match.group('pid'),
   153                 msg         = match.group('text'),
   213                 msg         = match.group('text'),
   154             )
   214             )
   155             
   215            
   156             # facility/severity prefix?
   216             # facility/severity prefix?
   157             if match.group('pri') :
   217             if match.group('pri') :
   158                 item.update(self.parse_pri(match))
   218                 item.update(self.parse_pri(match))
   159 
   219 
   160             return item
   220             return item