pvl/syslog/parser.py
author Tero Marttila <terom@paivola.fi>
Sat, 08 Mar 2014 14:37:20 +0200
changeset 30 1053e69664a6
parent 2 5a8a32cbc944
permissions -rw-r--r--
pvl.invoke: setenv
import datetime, time
import re

import logging; log = logging.getLogger('pvl.syslog.parser')

# Capture groups:
#   1: date and time of day down to whole seconds
#   2: optional fractional seconds (including the leading '.')
#   3: optional timezone designator, either 'Z' or a +HH:MM / -HH:MM offset
RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'

def rfc3339(timestamp):
    """
        Parse an RFC3339 timestamp, as used in some syslog implementations.

        Only the date and the whole-second time of day are used. Any
        fractional seconds and any timezone designator are matched by the
        pattern but discarded, so the result is a naive datetime holding the
        wall-clock value exactly as written in the timestamp.

        The pattern is only anchored at the start of the string; trailing
        text after the timestamp is ignored.

        Returns None if timestamp does not start with an RFC3339 date-time.
        Raises ValueError if the fields match the pattern but do not form a
        valid date/time (e.g. month 13).
    """

    match = RFC3339_RE.match(timestamp)

    if not match:
        return None

    # TODO: timezone?
    #   The offset is available as match.group(3) (group 2 is the fractional
    #   seconds). It is deliberately not applied here: doing so would change
    #   the values callers currently receive.
    return datetime.datetime.strptime(match.group(1), RFC3339_FMT)

# Traditional syslog timestamp: three-character month, day of month padded
# with a space or a digit, then HH:MM:SS. There is no year field.
RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
RFC3164_FMT = '%b %d %H:%M:%S'
RFC3164_PRE = '%Y ' # prepended so the missing year can be filled in with the current one

def rfc3164(timestamp):
    """
        Parse a traditional BSD syslog timestamp such as "Mar  8 14:37:20".

        The format carries no year, so the current year (per time.strftime)
        is assumed. Note that this means a timestamp written late in December
        and parsed in January ends up dated in the new year.

        Returns a naive datetime assumed to be in localtime, or None if
        timestamp does not start with something shaped like an RFC3164
        timestamp.
        Raises ValueError if the text matches the pattern but cannot be
        converted (unknown month name, trailing text, invalid day, ...).
    """

    if RFC3164_RE.match(timestamp) is None:
        return None

    # build "<year> <timestamp>" and the matching "<%Y> <format>"
    year_prefix = time.strftime(RFC3164_PRE)
    dated_value = year_prefix + timestamp
    dated_format = RFC3164_PRE + RFC3164_FMT

    return datetime.datetime.strptime(dated_value, dated_format)
       
class SyslogParser (object) :
    """
        Parse syslog lines in text format, as used in logfiles/fifos.

        Each parsed line becomes a dict with the keys:
            timestamp, host, prog, pid, msg, pri, facility, severity

        In raw mode the lines are not parsed at all; the whole line is used
        as the message and the remaining fields come from defaults.
    """

    # severity number -> name
    SEVERITIES = dict(enumerate((
        'emerg',
        'alert', 
        'crit', 
        'err',
        'warning',
        'notice',
        'info', 
        'debug',
    )))

    # facility number -> name
    FACILITIES = dict(enumerate((
        'kern',     # 0
        'user',     # 1
        'mail',     # 2
        'daemon',   # 3
        'auth',     # 4
        'syslog',   # 5
        'lpr',      # 6
        'news',     # 7
        'uucp',     # 8
        'cron',     # 9
        'authpriv', # 10
        'ftp',      # 11
        'ntp',      # 12
        'audit',    # 13
        'alert',    # 14
        'clock',    # 15
        'local0',   # 16
        'local1',   # 17
        'local2',   # 18
        'local3',   # 19
        'local4',   # 20
        'local5',   # 21
        'local6',   # 22
        'local7',   # 23
    )))

    # default syslogd format
    SYSLOG_RE = re.compile(
        # optional <pri> prefix, either numeric or symbolic facility.severity,
        # followed by the timestamp+hostname header
        # XXX:  hostname may be missing
        #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
            r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'
        +   r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
        +   r'(?P<hostname>\S+)? '

        # the message, including possible tag/pid
        +   r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
    )

    def __init__ (self, raw=False, facility=None, severity=None) :
        """
            raw         - do not parse lines; use each line as-is as the message
            facility    - default facility for lines that do not carry one
            severity    - default severity for lines that do not carry one
        """

        self.raw = raw
        self.facility = facility
        self.severity = severity

    def parse_pri (self, match) :
        """
            Parse pri/facility/severity from a SYSLOG_RE match.

            A numeric <pri> is split into facility = pri // 8 and
            severity = pri % 8. A symbolic <facility.severity> is used as
            given. Without any prefix, the defaults given to the constructor
            apply.

            Returns a dict with pri, severity and facility. Numeric values
            are mapped to their names where known, and passed through
            unchanged otherwise.
        """

        pri = match.group('pri')
        facility = match.group('facility') or self.facility
        severity = match.group('severity') or self.severity
        
        if pri and pri.isdigit() :
            pri = int(pri)
            facility, severity = divmod(pri, 8)

        return dict(
            pri         = pri,
            severity    = self.SEVERITIES.get(severity, severity),
            facility    = self.FACILITIES.get(facility, facility)
        )

    def parse_timestamp (self, match) :
        """
            Parse timestamp from a SYSLOG_RE match into a datetime.

            Tries the traditional RFC3164 format first, then RFC3339.

            Returns None if neither format recognizes the timestamp, or if it
            is recognized but cannot be converted (which is logged as a
            warning).
        """

        timestamp = match.group('timestamp')

        # timestamp, in various formats
        try :
            return rfc3164(timestamp) or rfc3339(timestamp)

        except ValueError as ex:
            # skip it
            log.warning("timestamp: %s:", timestamp, exc_info=ex)
            return None

    def parse_prog (self, match) :
        """
            Parse prog from a SYSLOG_RE match.

            Returns the program name from the message tag, or None if the
            message has no tag.
        """

        return match.group('program') or None

    def parse (self, line) :
        """
            Parse given input line into a dict.

            Surrounding whitespace is stripped from the line first.

            In raw mode the stripped line is the message, the timestamp is
            the current time, and facility/severity come from the constructor
            defaults. Raw and parsed items carry the same set of keys.

            Returns None (after logging a warning) if the line cannot be
            parsed.
        """

        # ignore whitespace
        line = line.strip()

        if self.raw :
            # from defaults
            return dict(
                timestamp   = datetime.datetime.now(), # XXX: None?
                host        = None,
                prog        = None,
                pid         = None,
                msg         = line,

                # same keys as parse_pri() gives for parsed lines
                pri         = None,
                severity    = self.SEVERITIES.get(self.severity, self.severity),
                facility    = self.FACILITIES.get(self.facility, self.facility),
            )

        match = self.SYSLOG_RE.match(line)

        if not match :
            # Logger.warn is a deprecated alias that newer Pythons have removed
            log.warning("Unparseable syslog message: %r", line)
            return None

        item = dict(
            timestamp   = self.parse_timestamp(match),
            host        = match.group('hostname'),
            prog        = self.parse_prog(match),
            pid         = match.group('pid'),   # string of digits, or None
            msg         = match.group('text'),
        )
       
        # facility/severity prefix?
        item.update(self.parse_pri(match))

        return item
    
    def process (self, lines) :
        """
            Yield parsed items (dicts) from given series of lines.

            Every parse result is logged at debug level; lines that cannot be
            parsed are skipped.
        """

        for line in lines :
            item = self.parse(line)

            log.debug("%s", item)

            if item :
                yield item

    # a parser instance can be called directly on a series of lines
    __call__ = process