pvl.syslog.parser: rfc3339 timestamp support
author Tero Marttila <terom@fixme.fi>
Sat, 05 Jan 2013 01:30:27 +0200
changeset 70 c8ec745a2aaa
parent 69 9da998198936
child 71 11b267e1b2b0
pvl.syslog.parser: rfc3339 timestamp support
pvl/syslog/parser.py
--- a/pvl/syslog/parser.py	Fri Jan 04 23:47:53 2013 +0200
+++ b/pvl/syslog/parser.py	Sat Jan 05 01:30:27 2013 +0200
@@ -3,6 +3,66 @@
 
 import logging; log = logging.getLogger('pvl.syslog.parser')
 
+RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
+RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'
+
+def rfc3339 (timestamp) :
+    """
+        RFC3339 timestamps as used in some syslog implementations.
+
+        Returns a datetime in some random timezone, possibly localtime.
+    """
+
+    match = RFC3339_RE.match(timestamp)
+
+    if not match :
+        return None
+    
+    # parts
+    dt = datetime.datetime.strptime(match.group(1), RFC3339_FMT)
+    tz = match.group(2)
+    
+    # TODO: timezone?
+    return dt
+
+    if not tz :
+        # XXX: localtime
+        return dt
+
+    elif tz == 'Z' :
+        # UTC
+        pass
+
+    elif tz[0] in '+-' :
+        hours, minutes = tz[1:].split(':')
+        td = datetime.timedelta(hours=int(hours), minutes=int(minutes))
+        
+        if tz[0] == '-' :
+            dt += td
+        if tz[0] == '+' :
+            dt -= td
+    else :
+        raise ValueError("Invalid timezone offset: %s" % timestamp)
+
+    # XXX: UTC
+    return dt
+
+RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
+RFC3164_FMT = '%b %d %H:%M:%S'
+RFC3164_PRE = '%Y ' # add missing year, assuming current
+
+def rfc3164 (timestamp) :
+    """
+        Traditional BSD Syslog timestamps.
+
+        Returns a datetime assumed to be in localtime.
+    """
+
+    if not RFC3164_RE.match(timestamp) :
+        return
+
+    return datetime.datetime.strptime(time.strftime(RFC3164_PRE) + timestamp, RFC3164_PRE + RFC3164_FMT)
+       
 class SyslogParser (object) :
     """
         Parse syslog lines in text format, as used in logfiles/fifos.
@@ -45,21 +105,20 @@
         'local6',   # 22
         'local7',   # 23
     )))
-   
+
     # default syslogd format
     SYSLOG_RE = re.compile(
         # the timestamp+hostname header
         # XXX:  hostname may be missing
         #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
             r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'
-        +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}) (?P<hostname>\S+)? '
+        +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
+        +   r'(?P<hostname>\S+)? '
 
         # the message, including possible tag/pid
         +   r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
     )
 
-    TIMESTAMP_FMT = '%b %d %H:%M:%S'
-
     def __init__ (self, raw=False) :
         """
             Using given underlying line source.
@@ -92,14 +151,15 @@
         """
 
         timestamp = match.group('timestamp')
-        
-        # add missing year; assume current
-        timestamp = time.strftime('%Y') + ' ' + timestamp
-        
-        # k
-        timestamp = datetime.datetime.strptime(timestamp, '%Y ' + self.TIMESTAMP_FMT)
 
-        return timestamp
+        # timestamp, in various formats
+        try :
+            return rfc3164(timestamp) or rfc3339(timestamp)
+
+        except ValueError as ex:
+            # skip it
+            log.warning("timestamp: %s:", timestamp, exc_info=ex)
+            return None
 
     def parse_prog (self, match) :
         """
@@ -152,7 +212,7 @@
                 pid         = match.group('pid'),
                 msg         = match.group('text'),
             )
-            
+           
             # facility/severity prefix?
             if match.group('pri') :
                 item.update(self.parse_pri(match))