"""
Parse syslog lines in text format (as found in logfiles/fifos) into dicts.
"""

import datetime
import time
import re

import logging

log = logging.getLogger('pvl.syslog.parser')

# group 1: date and time, group 2: optional fractional seconds,
# group 3: optional timezone ('Z' or a +hh:mm / -hh:mm offset)
RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'


def rfc3339(timestamp, utc=False):
    """
        RFC3339 timestamps as used in some syslog implementations.

        timestamp   - string starting with an RFC3339 date-time
        utc         - if true, apply a numeric timezone offset so that the
                      result is in UTC; timestamps without an offset, or
                      with 'Z', are returned unchanged

        Returns a naive datetime with second precision (any fractional
        seconds are dropped), or None if the string does not start with an
        RFC3339 timestamp.

        By default the timezone is ignored, and the result is the wall-clock
        time exactly as written in the timestamp.
    """

    match = RFC3339_RE.match(timestamp)

    if not match:
        return None

    dt = datetime.datetime.strptime(match.group(1), RFC3339_FMT)

    # the timezone is the third group; the second one is the fraction
    tz = match.group(3)

    if not utc or not tz or tz == 'Z':
        # as written: unknown/ignored timezone, or already UTC
        return dt

    # the regex guarantees the [+-]hh:mm form here
    hours, minutes = tz[1:].split(':')
    td = datetime.timedelta(hours=int(hours), minutes=int(minutes))

    if tz[0] == '-':
        # behind UTC
        return dt + td
    else:
        # ahead of UTC
        return dt - td


RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
RFC3164_FMT = '%b %d %H:%M:%S'
RFC3164_PRE = '%Y '  # add missing year, assuming current


def rfc3164(timestamp):
    """
        Traditional BSD Syslog timestamps ("Jan  5 12:34:56").

        The format carries no year, so the current year is prepended before
        parsing.

        Returns a naive datetime assumed to be in localtime, or None if the
        string does not start with such a timestamp.

        Raises ValueError if the string looks like a timestamp, but cannot
        be parsed by strptime (e.g. an unknown month name).
    """

    if not RFC3164_RE.match(timestamp):
        return None

    year = time.strftime(RFC3164_PRE)

    return datetime.datetime.strptime(year + timestamp, RFC3164_PRE + RFC3164_FMT)


class SyslogParser(object):
    """
        Parse syslog lines in text format, as used in logfiles/fifos.
    """

    # severity number -> name
    SEVERITIES = dict(enumerate((
        'emerg',
        'alert',
        'crit',
        'err',
        'warning',
        'notice',
        'info',
        'debug',
    )))

    # facility number -> name
    FACILITIES = dict(enumerate((
        'kern',         # 0
        'user',         # 1
        'mail',         # 2
        'daemon',       # 3
        'auth',         # 4
        'syslog',       # 5
        'lpr',          # 6
        'news',         # 7
        'uucp',         # 8
        'cron',         # 9
        'authpriv',     # 10
        'ftp',          # 11
        'ntp',          # 12
        'audit',        # 13
        'alert',        # 14
        'clock',        # 15
        'local0',       # 16
        'local1',       # 17
        'local2',       # 18
        'local3',       # 19
        'local4',       # 20
        'local5',       # 21
        'local6',       # 22
        'local7',       # 23
    )))

    # default syslogd format
    SYSLOG_RE = re.compile(
        # optional <pri> prefix, either numeric or facility.severity
        r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'

        # the timestamp+hostname header
        # XXX:  hostname may be missing
        #       at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
        + r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
        + r'(?P<hostname>\S+)? '

        # the message, including possible tag/pid
        + r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
    )

    def __init__(self, raw=False, facility=None, severity=None):
        """
            Using given facility/severity as default.

            raw         - treat every line as a bare message, without any
                          syslog header to parse
            facility    - facility for lines that do not carry one
            severity    - severity for lines that do not carry one
        """

        self.raw = raw
        self.facility = facility
        self.severity = severity

    def parse_pri(self, match):
        """
            Parse pri/facility/severity.

            Returns a dict with the keys pri, severity and facility.

            A numeric pri is split into facility and severity numbers, which
            are mapped to names when known. A textual facility.severity pri
            is used as-is. Without any pri, the defaults given to __init__
            are used.
        """

        pri = match.group('pri')
        facility = match.group('facility') or self.facility
        severity = match.group('severity') or self.severity

        if pri and pri.isdigit():
            # pri = facility * 8 + severity
            pri = int(pri)
            facility, severity = divmod(pri, 8)

        return dict(
            pri=pri,
            severity=self.SEVERITIES.get(severity, severity),
            facility=self.FACILITIES.get(facility, facility),
        )

    def parse_timestamp(self, match):
        """
            Parse timestamp from line into datetime.

            Returns None if the timestamp is not in any known format.
        """

        timestamp = match.group('timestamp')

        # timestamp, in various formats
        try:
            return rfc3164(timestamp) or rfc3339(timestamp)

        except ValueError as ex:
            # skip it
            log.warning("timestamp: %s:", timestamp, exc_info=ex)
            return None

    def parse_prog(self, match):
        """
            Parse prog from line.

            Returns None if the message has no tag.
        """

        return match.group('program') or None

    def parse(self, line):
        """
            Parse given input line into a dict with the keys timestamp, host,
            prog, pid, msg, pri, facility and severity.

            In raw mode, the whole stripped line is the message, stamped
            with the current time and the default facility/severity.

            Returns None, logging a warning, if the line cannot be parsed.
        """

        # ignore whitespace
        line = line.strip()

        if self.raw:
            # from defaults, in the same shape as for parsed lines
            return dict(
                timestamp=datetime.datetime.now(),  # XXX: None?
                host=None,
                prog=None,
                pid=None,
                msg=line,
                pri=None,
                severity=self.SEVERITIES.get(self.severity, self.severity),
                facility=self.FACILITIES.get(self.facility, self.facility),
            )

        match = self.SYSLOG_RE.match(line)

        if not match:
            log.warning("Unparseable syslog message: %r", line)
            return None

        item = dict(
            timestamp=self.parse_timestamp(match),
            host=match.group('hostname'),
            prog=self.parse_prog(match),
            pid=match.group('pid'),
            msg=match.group('text'),
        )

        # facility/severity prefix?
        item.update(self.parse_pri(match))

        return item

    def process(self, lines):
        """
            Yield parsed items from given series of lines, skipping any
            lines that cannot be parsed.
        """

        for line in lines:
            item = self.parse(line)

            log.debug("%s", item)

            if item:
                yield item

    __call__ = process