|
1 """ |
|
2 Parse log data into log_events |
|
3 """ |
|
4 |
|
5 import re |
|
6 import datetime |
|
7 |
|
8 from log_line import LogTypes, LogLine |
|
9 |
|
class LogParseError (Exception) :
    """
        Raised when parsing some log line failed.
    """

    def __init__ (self, line, offset, message) :
        # describe the offending line, where it was, and why it failed
        detail = "%r@%s: %s" % (line, offset, message)

        super(LogParseError, self).__init__(detail)
|
17 |
|
class LogParser (object) :
    """
        Abstract interface for parsing raw log data into log events.
    """

    def __init__ (self, tz, timestamp_fmt="%H:%M:%S") :
        """
            Setup the parser to use the given format for line timestamps, which are of the given timezone.

                tz              - timezone that line timestamps are in (presumably a pytz-style
                                  timezone object; only stored here — TODO confirm against subclasses)
                timestamp_fmt   - strptime() format string for line timestamps
        """

        self.tz = tz
        self.timestamp_fmt = timestamp_fmt

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline.

            Channel is the LogChannel that these lines belong to.

            Offset is the starting offset, and may be None to not use it.

            Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date
            information, event timestamps will have a date component of 1900/1/1.
        """

        # implemented by subclasses; the original bare-name `abstract` statement
        # would only raise a confusing NameError at call time
        raise NotImplementedError()
|
44 |
|
class IrssiParser (LogParser) :
    """
        A parser for irssi logfiles.

        Lines are matched against TYPE_REGEXES in order; the first pattern that
        matches determines the line's LogType and its captured fields.
    """

    # timestamp prefix, with trailing space
    _TS = r'(?P<timestamp>[a-zA-Z0-9: ]+[a-zA-Z0-9])\s*'

    # subexpression parts
    _NICK = r'(?P<nickname>.+?)'
    _NICK2 = r'(?P<nickname2>.+?)'
    _TARGET = r'(?P<target>.+?)'
    _CHAN = r'(?P<channel>.+?)'
    _CHAN2 = r'(?P<channel2>.+?)'
    _USERHOST = r'(?P<username>.*?)@(?P<hostname>.*?)'
    _MSG = r'(?P<message>.*)'
    _SRV1 = r'(?P<server1>.+?)'
    _SRV2 = r'(?P<server2>.+?)'

    # regular expressions for matching lines, by type
    # NB: raw strings throughout, so regex escapes like \[ are unambiguous
    TYPE_EXPRS = (
        ( LogTypes.LOG_OPEN,        r'--- Log opened (?P<datetime>.+)' ),
        ( LogTypes.LOG_CLOSE,       r'--- Log closed (?P<datetime>.+)' ),
        ( LogTypes.MSG,             _TS + r'<(?P<flags>.)' + _NICK + '> ' + _MSG ),
        ( LogTypes.NOTICE,          _TS + r'-' + _NICK + ':' + _CHAN + '- ' + _MSG ),
        ( LogTypes.ACTION,          _TS + r'\* ' + _NICK + ' ' + _MSG ),
        ( LogTypes.JOIN,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has joined ' + _CHAN ),
        ( LogTypes.PART,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has left ' + _CHAN + r' \[(?P<message>.*?)\]' ),
        ( LogTypes.KICK,            _TS + r'-!- ' + _TARGET + ' was kicked from ' + _CHAN + ' by ' + _NICK + r' \[(?P<message>.*?)\]' ),
        # XXX: use hostname instead of nickname for ServerMode
        ( LogTypes.MODE,            _TS + r'-!- (mode|ServerMode)/' + _CHAN + r' \[(?P<mode>.+?)\] by (?P<nickname>\S+)' ),
        ( LogTypes.NICK,            _TS + r'-!- ' + _NICK + r' is now known as (?P<target>\S+)' ),
        ( LogTypes.QUIT,            _TS + r'-!- ' + _NICK + r' \[' + _USERHOST + r'\] has quit \[(?P<message>.*?)\]' ),
        ( LogTypes.TOPIC,           _TS + r'-!- (' + _NICK + ' changed the topic of ' + _CHAN + r' to: (?P<topic>.*)|Topic unset by ' + _NICK2 + ' on ' + _CHAN2 + ')' ),

        ( LogTypes.SELF_NOTICE,     _TS + r'\[notice\(' + _CHAN + r'\)\] ' + _MSG ),
        ( LogTypes.SELF_NICK,       _TS + r'-!- You\'re now known as (?P<target>\S+)' ),

        ( LogTypes.NETSPLIT_START,  _TS + r'-!- Netsplit ' + _SRV1 + ' <-> ' + _SRV2 + r' quits: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more,\S+\))?' ),
        ( LogTypes.NETSPLIT_END,    _TS + r'-!- Netsplit over, joins: (?P<nick_list>[^(]+)( \(\+(?P<count>\d+) more\))?' ),

        ( 'DAY_CHANGED',            r'--- Day changed (?P<date>.+)' ),
    )

    # precompile (avoid shadowing the builtin `type`)
    TYPE_REGEXES = [(line_type, re.compile(expr)) for line_type, expr in TYPE_EXPRS]

    def _parse_timestamp (self, groups, date, line, offset) :
        """
            Build a naive datetime from the matched groups.

            Log open/close lines carry a full asctime()-style datetime; normal
            lines carry a time-of-day (combined with the given date, if any);
            DAY_CHANGED lines carry a date only.

            Raises LogParseError if the matched pattern had no time information.
        """

        if 'datetime' in groups :
            # full datetime using the default asctime() format
            return datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y')

        elif 'timestamp' in groups :
            # time-of-day only -> naive datetime with a 1900/1/1 date component
            dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt)

            # override the date component, if known
            if date :
                dt = dt.replace(year=date.year, month=date.month, day=date.day)

            return dt

        elif 'date' in groups :
            # date-only datetime (DAY_CHANGED)
            return datetime.datetime.strptime(groups['date'], '%a %b %d %Y')

        else :
            # no timestamp !?
            raise LogParseError(line, offset, "No timestamp")

    def _parse_data (self, groups) :
        """
            Extract the type-specific data payload from the matched groups, or
            None if the matched pattern carries no payload.
        """

        if 'message' in groups :
            return groups['message']

        elif 'mode' in groups :
            return groups['mode']

        elif 'topic' in groups :
            return groups['topic']

        elif 'nick_list' in groups :
            # split into components (avoid shadowing the builtin `list`)
            nicks = groups['nick_list'].split(', ')

            # additional "+N more" count?
            # NB: groupdict(None) maps unmatched optional groups to None
            if groups.get('count') :
                nicks.append('+%d' % int(groups['count']))

            # join
            return ' '.join(nicks)

        else :
            return None

    def parse_line (self, channel, line, date, offset=None) :
        """
            Parse a single line, and return the resulting (date, LogLine) tuple,
            or None, to ignore the line.

            Uses self.TYPE_REGEXES to do the matching.

            For DAY_CHANGED lines the returned tuple is (new_date, None): there
            is no LogLine, but the caller should carry the new date forward.

            Raises LogParseError if no pattern matches, or the line has no
            timestamp.
        """

        # empty line -> ignore
        if not line :
            return None

        # look for a match, testing each type in order; first match wins
        match = line_type = None

        for line_type, regex in self.TYPE_REGEXES :
            match = regex.match(line)

            if match :
                break

        # no match found?
        if not match :
            raise LogParseError(line, offset, "Line did not match any type")

        # all named groups of the matched pattern; unmatched ones map to None
        groups = match.groupdict(None)

        # naive timestamp, localized with our timezone
        dtz = self.tz.localize(self._parse_timestamp(groups, date, line, offset))

        # channel, currently unused
        channel_name = (groups.get('channel') or groups.get('channel2'))

        # source: netsplits carry a server, other events a (nick, user, host, flags)
        if 'server1' in groups :
            source = (None, None, groups.get('server1'), None)

        else :
            source = (groups.get('nickname') or groups.get('nickname2'), groups.get('username'), groups.get('hostname'), groups.get('flags'))

        # target: the second netsplit server, or the event's target nick
        if 'server2' in groups :
            target = groups.get('server2')

        else :
            target = groups.get('target')

        # type-specific data payload
        data = self._parse_data(groups)

        # custom types?
        if line_type == 'DAY_CHANGED' :
            # XXX: fixed — the original assigned the new date to a local and fell
            # through to return None, silently losing the date change; return it
            # so parse_lines() can apply it to subsequent timestamps
            return dtz, None

        else :
            # build+return (date, LogLine)
            return date, LogLine(channel, offset, line_type, dtz, source, target, data)

    def parse_lines (self, channel, lines, date=None, starting_offset=None) :
        """
            Parse the given lines, yielding LogLines.

            Carries the date forward across DAY_CHANGED lines, so later
            time-of-day timestamps get the right date component.
        """

        for index, line in enumerate(lines) :
            # absolute offset for this line, or None if we weren't given one
            # NB: compare against None, so a starting_offset of zero still counts
            if starting_offset is not None :
                offset = starting_offset + index

            else :
                offset = None

            # try and parse
            try :
                # get None, or (date, event) — event may be None for date-only lines
                line_info = self.parse_line(channel, line, date, offset)

            # passthrough LogParseError's
            except LogParseError :
                raise

            # wrap other errors as LogParseError
            except Exception as e :
                raise LogParseError(line, offset, "Parsing line failed: %s" % e)

            # nothing?
            if not line_info :
                continue

            # unpack, update date for subsequent lines
            date, event = line_info

            # yield real events only
            if event :
                yield event
|
232 |
|
233 |