1 import datetime, time |
|
2 import re |
|
3 |
|
4 import logging; log = logging.getLogger('pvl.syslog.parser') |
|
5 |
|
# date/time part, optional fractional seconds, optional zone ('Z' or +-HH:MM)
RFC3339_RE = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d+)?(Z|[+-]\d{2}:\d{2})?')
RFC3339_FMT = '%Y-%m-%dT%H:%M:%S'


def rfc3339(timestamp, utc=False):
    """
    Parse an RFC3339 timestamp, as used in some syslog implementations.

    Only the start of the string has to look like a timestamp; anything
    following the matched part is ignored. Fractional seconds are matched
    but discarded, so the result always has second resolution.

    timestamp   - string starting with YYYY-MM-DDTHH:MM:SS[.frac][Z|+-HH:MM]
    utc         - if False (the default), the zone designator is ignored and
                  the wall-clock fields are returned as written, i.e. in
                  whatever timezone the sender used.
                  if True, a +-HH:MM offset is applied so that the result is
                  in UTC; 'Z' is already UTC, and a timestamp without any
                  zone designator is returned unchanged.

    Returns a naive datetime (no tzinfo), or None if the string does not
    start with an RFC3339 timestamp.
    """

    match = RFC3339_RE.match(timestamp)

    if not match:
        return None

    # group 1 is the date/time, group 2 the fraction, group 3 the zone
    dt = datetime.datetime.strptime(match.group(1), RFC3339_FMT)
    tz = match.group(3)

    if not utc or not tz or tz == 'Z':
        # nothing to convert: zone ignored, unknown, or already UTC
        return dt

    # the regex guarantees the remaining form is exactly +HH:MM or -HH:MM
    hours, minutes = tz[1:].split(':')
    offset = datetime.timedelta(hours=int(hours), minutes=int(minutes))

    # local = UTC + offset, so UTC = local - offset
    if tz[0] == '-':
        return dt + offset

    return dt - offset
|
49 |
|
# "Mmm dd HH:MM:SS", where the day may be space-padded ("Jan  5")
RFC3164_RE = re.compile(r'\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}')
RFC3164_FMT = '%b %d %H:%M:%S'
RFC3164_PRE = '%Y '     # add missing year, assuming current


def rfc3164(timestamp):
    """
    Parse a traditional BSD syslog timestamp ("Mmm dd HH:MM:SS").

    The format carries no year, so the current local year is prepended
    before parsing. Only the start of the string is checked against the
    expected shape.

    Returns a naive datetime assumed to be in localtime, or None if the
    string does not start with a BSD-style timestamp.

    Raises ValueError if the string has the right shape but strptime
    rejects it (e.g. an unknown month name or an impossible date).
    """

    if RFC3164_RE.match(timestamp) is None:
        return None

    # format the current year with the same prefix format used for parsing
    year_prefix = time.strftime(RFC3164_PRE)
    full_format = RFC3164_PRE + RFC3164_FMT

    return datetime.datetime.strptime(year_prefix + timestamp, full_format)
|
65 |
|
class SyslogParser(object):
    """
    Parse syslog lines in text format, as used in logfiles/fifos.

    Each line is turned into a plain dict. In normal mode the dict has the
    keys timestamp, host, prog, pid, msg, pri, severity and facility. In raw
    mode the line is not parsed at all and the dict only has timestamp,
    host, prog, pid and msg.
    """

    # numeric severity code -> name
    SEVERITIES = dict(enumerate((
        'emerg',
        'alert',
        'crit',
        'err',
        'warning',
        'notice',
        'info',
        'debug',
    )))

    # numeric facility code -> name
    FACILITIES = dict(enumerate((
        'kern',     # 0
        'user',     # 1
        'mail',     # 2
        'daemon',   # 3
        'auth',     # 4
        'syslog',   # 5
        'lpr',      # 6
        'news',     # 7
        'uucp',     # 8
        'cron',     # 9
        'authpriv', # 10
        'ftp',      # 11
        'ntp',      # 12
        'audit',    # 13
        'alert',    # 14
        'clock',    # 15
        'local0',   # 16
        'local1',   # 17
        'local2',   # 18
        'local3',   # 19
        'local4',   # 20
        'local5',   # 21
        'local6',   # 22
        'local7',   # 23
    )))

    # default syslogd format
    SYSLOG_RE = re.compile(
        # optional <pri> prefix, either numeric or facility.severity
        r'(?:<(?P<pri>\d+|(?P<facility>\w+)\.(?P<severity>\w+))>)?'

        # the timestamp+hostname header
        # XXX: hostname may be missing
        #   at least in Ubuntu 11.10 syslogd 'last message repeated 2 times'...
        + r'(?P<timestamp>\w{3} [0-9 ][0-9] \d{2}:\d{2}:\d{2}|.+?) '
        + r'(?P<hostname>\S+)? '

        # the message, including possible tag/pid
        + r"(?P<message>(?P<tag>(?P<program>[^:\]]+)(?:\[(?P<pid>\d+)\])?: )?(?P<text>.*))\n?"
    )

    def __init__(self, raw=False, facility=None, severity=None):
        """
        raw         - if true, lines are not parsed; the whole (stripped)
                      line becomes the message
        facility    - default facility for lines without a <pri> prefix
        severity    - default severity for lines without a <pri> prefix
        """

        self.raw = raw
        self.facility = facility
        self.severity = severity

    def parse_pri(self, match):
        """
        Parse pri/facility/severity from a SYSLOG_RE match.

        A numeric pri is split into facility = pri // 8 and
        severity = pri % 8. Known numeric codes are translated to their
        names; anything else (names from a facility.severity prefix, the
        configured defaults, unknown codes) is passed through as-is.

        Returns a dict with the keys pri, severity and facility.
        """

        pri = match.group('pri')
        facility = match.group('facility') or self.facility
        severity = match.group('severity') or self.severity

        if pri and pri.isdigit():
            pri = int(pri)
            facility, severity = divmod(pri, 8)

        return dict(
            pri=pri,
            severity=self.SEVERITIES.get(severity, severity),
            facility=self.FACILITIES.get(facility, facility),
        )

    def parse_timestamp(self, match):
        """
        Parse timestamp from line into datetime.

        Tries the BSD (rfc3164) format first, then rfc3339.

        Returns a datetime, or None if neither format matches or the
        timestamp is rejected with a ValueError (which is logged).
        """

        timestamp = match.group('timestamp')

        # timestamp, in various formats
        try:
            return rfc3164(timestamp) or rfc3339(timestamp)

        except ValueError as ex:
            # skip it
            log.warning("timestamp: %s:", timestamp, exc_info=ex)
            return None

    def parse_prog(self, match):
        """
        Parse prog from line.

        Returns the program name from the message tag, or None if the
        message has no tag.
        """

        prog = match.group('program')

        if prog:
            return prog
        else:
            # no tag
            return None

    def parse(self, line):
        """
        Parse given input line into a dict.

        Returns the item dict, or None if the line does not look like a
        syslog message (which is logged as a warning).
        """

        # ignore whitespace
        line = line.strip()

        if self.raw:
            # no parsing; the raw item carries no pri/facility/severity keys
            return dict(
                timestamp=datetime.datetime.now(),  # XXX: None?
                host=None,
                prog=None,
                pid=None,
                msg=line,
            )

        match = self.SYSLOG_RE.match(line)

        if not match:
            # Logger.warn() is a deprecated alias, gone in newer Pythons
            log.warning("Unparseable syslog message: %r", line)
            return None

        item = dict(
            timestamp=self.parse_timestamp(match),
            host=match.group('hostname'),
            prog=self.parse_prog(match),
            pid=match.group('pid'),
            msg=match.group('text'),
        )

        # facility/severity prefix?
        item.update(self.parse_pri(match))

        return item

    def process(self, lines):
        """
        Yield parsed item dicts from given series of lines.

        Every parse result is logged at debug level; lines that could not
        be parsed are skipped.
        """

        for line in lines:
            item = self.parse(line)

            log.debug("%s", item)

            if item:
                yield item

    __call__ = process
|
234 |
|