1 """ |
|
2 A source of IRC log files |
|
3 """ |
|
4 |
|
5 import datetime, calendar, itertools, functools, math |
|
6 import os, os.path, errno |
|
7 import pytz |
|
8 |
|
9 import config, utils |
|
10 |
|
11 # a timedelta that represents one day |
|
12 ONE_DAY = datetime.timedelta(days=1) |
|
13 |
|
class LogSourceDecoder (object) :
    """
    Handles decoding of LogSource lines
    """

    def __init__ (self, encoding_list) :
        """
        Will try each of the given (charset, errors) items in turn, until one succeeds

            encoding_list       - sequence of (charset, errors) tuples, as accepted by str.decode()
        """

        self.encoding_list = encoding_list

    def decode (self, line) :
        """
        Decode the line of str() text into an unicode object.

        Tries each configured (charset, errors) pair in turn; raises UnicodeDecodeError if
        none of them can decode the line.
        """

        # list of errors encountered
        error_list = []

        # try each in turn
        for charset, errors in self.encoding_list :
            # trap UnicodeDecodeError to try with the next one
            try :
                return line.decode(charset, errors)

            except UnicodeDecodeError as e :
                error_list.append("%s:%s - %s" % (charset, errors, e))
                continue

        # failure
        # NOTE: UnicodeDecodeError requires the five (encoding, object, start, end, reason)
        # arguments; the old single-message call raised TypeError instead of the intended error
        raise UnicodeDecodeError("unknown", line, 0, len(line),
            "Failed to decode line with any of: %s" % ', '.join(error_list))
|
46 |
|
class LogSource (object) :
    """
    A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    """

    def __init__ (self, decoder, channel=None) :
        """
        Construct with the LogSourceDecoder used to decode raw lines.

        The LogChannel is needed to construct LogLines; if it is not yet known, pass None
        and set it later with bind_channel.
        """

        self.channel = channel
        self.decoder = decoder

    def bind_channel (self, channel) :
        """
        Bind this source to its channel; only valid if no channel was set before.
        """

        assert not self.channel

        self.channel = channel

    def get_latest (self, count) :
        """
        Yield up to `count` of the most recent events.
        """

        abstract

    def get_date (self, dt) :
        """
        Get logs for the given date (as a datetime).
        """

        abstract

    def get_date_paged (self, dt, count, page=None) :
        """
        Get the logs for a given date (as a datetime), divided into pages of `count` lines each.

        If page is given, the time portion of dt is ignored and the lines for that page are
        returned. Otherwise, the page containing the given timestamp is located and returned.

        Returns a (page, max_pages, lines) tuple.
        """

        # with an explicit page we just skip over the preceding pages;
        # otherwise we hunt for the page that contains the dt timestamp
        skip = (page - 1) * count if page else None

        current_page = 1        # page currently being accumulated
        prev_ts = None          # timestamp of the previously examined line
        found = False           # located the page containing dt yet?
        line_count = 0          # total number of lines seen, for max_pages
        lines = []              # lines of the page being built

        for line in self.get_date(dt) :
            line_count += 1

            # burn through the lines before the requested page
            if skip :
                skip -= 1
                continue

            if page or found :
                # target page is known; fill it up, then just keep counting lines
                if len(lines) < count :
                    lines.append(line)

                continue

            # still hunting: roll over to the next page once this one is full
            if len(lines) >= count :
                lines = []
                current_page += 1

            # does dt fall between the previous line and this one?
            if (not prev_ts or prev_ts <= dt) and (dt <= line.timestamp) :
                # found the page
                found = True
                page = current_page

            else :
                # keep looking
                prev_ts = line.timestamp

            lines.append(line)

        # total number of pages needed for all the lines seen
        max_pages = math.ceil(float(line_count) / count)

        return (page, max_pages, lines)

    def get_month_days (self, dt) :
        """
        Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
        """

        abstract

    def get_modified (self, dt=None, after=None, until=None) :
        """
        Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.

        If the datetime is not given, *all* lines are returned.

        If after is given, only lines from said date onwards will be returned, regardless of modification.
        If until is given, only lines up to and including said date will be returned, regardless of modification.

        The LogLines should be in time order.
        """

        abstract

    def get_prev_date (self, dt) :
        """
        Get the next distinct date of logs available preceding the given date, or None.
        """

        abstract

    def get_next_date (self, dt) :
        """
        Get the next distinct date of logs following the given date, or None.
        """

        abstract
|
195 |
|
class LogFile (object) :
    """
    A file containing LogEvents

    XXX: modify to implement LogSource?
    """

    def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
        """
        Open the file at the given path, which contains lines as separated by the given separator. Lines are
        decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
        as the initial date for this log's first line.

        XXX: currently we assume start_date also for the end of the file
        """

        # store
        self.channel = channel
        self.path = path
        self.parser = parser
        self.start_date = start_date
        self.decoder = decoder
        self.sep = sep

        # open in binary mode; charset handling is done explicitly by the decoder
        self.file = open(path, 'rb')

    def close (self) :
        """
        Release the underlying file handle; the LogFile must not be used afterwards.

        Previously there was no way to close the file, leaking the handle for the
        lifetime of the object.
        """

        self.file.close()

    def __enter__ (self) :
        """
        Context manager support: returns self.
        """

        return self

    def __exit__ (self, exc_type, exc_value, traceback) :
        """
        Context manager support: closes the file.
        """

        self.close()

    def __iter__ (self) :
        """
        Yields a series of unicode lines, as read from the top of the file.
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines, decoding them as well
        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)

    def read_full (self) :
        """
        Reads all LogLines. The LogLines will have a valid offset.
        """

        # just use our __iter__
        return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)

    def read_from (self, dt) :
        """
        Reads all LogLines from the given naive timestamp onwards.
        """

        # start reading at beginning
        events = self.read_full()

        # skip unwanted events
        for event in events :
            if event.timestamp < dt :
                continue

            else :
                # include this line as well
                yield event
                break

        # yield the rest as-is
        for event in events :
            yield event

    def read_until (self, dt) :
        """
        Reads all LogLines up until (and including) the given naive timestamp.
        """

        # start reading events at the beginning
        events = self.read_full()

        # yield events until we hit the given timestamp
        for event in events :
            if event.timestamp <= dt :
                yield event

            else :
                break

        # ignore the rest
        return

    def _read_blocks_reverse (self, blocksize=1024) :
        """
        Yields blocks of file data in reverse order, starting at the end of the file.
        """

        # seek to end of file
        self.file.seek(0, os.SEEK_END)

        # read offset
        # XXX: hack -1 to get rid of trailing newline; note this drops the last byte
        # if the file does NOT end with a newline
        size = offset = self.file.tell() - 1

        # do not try to read past the beginning of the file
        while offset > 0:
            # calc new offset + size
            if offset > blocksize :
                # full block
                offset -= blocksize
                read_size = blocksize

            else :
                # partial block at the start of the file
                read_size = offset
                offset = 0

            # seek to offset
            self.file.seek(offset)

            # read the data we want
            block = self.file.read(read_size)

            # sanity check
            assert len(block) == read_size

            # yield
            yield block

    def _read_lines_reverse (self) :
        """
        Yields decoded lines from the end of the file, in reverse order.
        """

        # partial line carried between blocks
        buf = ''

        # read from end of file, a block at a time
        for block in self._read_blocks_reverse() :
            # add in our previous buf
            buf = block + buf

            # split up lines
            lines = buf.split(self.sep)

            # keep the first one as our buffer, as it's incomplete
            buf = lines[0]

            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
            # XXX: use something like islice, this has to build a slice object
            for line in lines[:0:-1] :
                yield self.decoder.decode(line)

        # once all blocks are consumed, buf holds the complete first line of the
        # file; the old code dropped it, losing one line when reading reached
        # the start of the file
        if buf :
            yield self.decoder.decode(buf)

    def read_latest (self, count) :
        """
        Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
        """

        # the list of lines
        lines = []

        # start reading lines into lines
        for line in self._read_lines_reverse() :
            # append
            lines.append(line)

            # done?
            if len(lines) >= count :
                break

        # decode in reverse order, using our starting date....
        # XXX: use lines[::-1] or reversed?
        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
        return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
|
365 |
|
class LogDirectory (LogSource) :
    """
    A directory containing a series of timestamped LogFiles
    """

    def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
        """
        Load the logfiles at the given path, which are for the given LogChannel.

        Decode the file lines using the given decoder; the files are named according to the date in the given
        timezone and date format, and will be parsed using the given parser.
        """

        # store
        self.channel = channel
        self.path = path
        self.tz = tz
        self.parser = parser
        self.decoder = decoder
        self.filename_fmt = filename_fmt

    def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
        """
        Get the logfile corresponding to the given naive date in our timezone.

        If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
        then this returns the file's mtime.

        If the logfile does not exist, returns None when ignore_missing is True, and lets the IOError
        propagate otherwise (the old docstring had this inverted).
        """

        # filename for the given date
        filename = d.strftime(self.filename_fmt)

        # full path to the logfile
        path = os.path.join(self.path, filename)

        try :
            if load :
                # open+return the LogFile
                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)

            elif mtime :
                # stat
                return utils.mtime(path)

            else :
                # test
                return os.path.exists(path)

        # XXX: move to LogFile
        except IOError as e :
            # return None for missing files if requested, propagate everything else
            if e.errno == errno.ENOENT and ignore_missing :
                return None

            else :
                raise

    def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
        """
        Yields a series of naive datetime objects representing the logfiles that are available, in time order.

        Parameters :
            after       only dates from said date onwards will be returned
            until       only dates up to and including said date will be returned
            reverse     the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
        """

        # convert the aware bounds into plain dates in our timezone
        if after :
            after = after.astimezone(self.tz).date()

        if until :
            until = until.astimezone(self.tz).date()

        # every entry in the log dir
        filenames = os.listdir(self.path)

        # filename order is date order, by construction of filename_fmt
        filenames.sort(reverse=reverse)

        for filename in filenames :
            try :
                # parse the filename back into a date in our timezone
                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
                date = dt.date()

            except ValueError :
                # not a logfile name; note the old bare except: also hid unrelated errors
                continue

            # apply the after/until bounds
            if (after and date < after) or (until and date > until) :
                continue

            yield dt

    def _iter_date_reverse (self, dt=None) :
        """
        Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
        given *datetime*, or the current date, if none given.
        """

        # default to now
        if not dt :
            dtz = self.tz.localize(datetime.datetime.now())

        else :
            # convert to target timezone
            dtz = dt.astimezone(self.tz)

        # iterate unto infinity
        while True :
            # yield
            yield dtz.date()

            # one day backwards
            dtz -= ONE_DAY

    def _iter_logfile_reverse (self, dt=None, max_files=100) :
        """
        Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
        current date, if none given.

        Reads/probes at most max_files files, and raises an Exception if none of them existed.
        """

        # number of files probed so far
        file_count = 0

        # have we found any files at all so far?
        have_found = False

        # iterate backwards over days
        for day in self._iter_date_reverse(dt) :
            # stop if we've handled enough files by now
            if file_count > max_files :
                break

            # probe the logfile for this day, if any
            file_count += 1
            logfile = self._get_logfile_date(day, ignore_missing=True)

            # no logfile there?
            if not logfile :
                # hit our limit?
                if file_count > max_files :
                    # if we didn't find any logfiles at all, terminate rudely
                    if not have_found :
                        raise Exception("No recent logfiles found")

                    else :
                        # stop looking, deal with what we've got
                        return

                else :
                    # skip to next day
                    continue

            # mark have_found
            have_found = True

            # yield it
            yield logfile

    def get_latest (self, count) :
        """
        Uses _iter_logfile_reverse to read the given number of lines from as many logfiles as needed.
        """

        # read the events into here
        lines = []

        # start reading in those logfiles
        for logfile in self._iter_logfile_reverse() :
            # read the events
            # XXX: use a queue
            lines = list(logfile.read_latest(count)) + lines

            # done?
            if len(lines) >= count :
                break

        # return the events
        return lines

    def get_date (self, dt) :
        """
        A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
        datetime differs from our native timezone, this may involve lines from more than one logfile.
        """

        # begin/end of the 24h period, in target timezone
        dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)

        # as dates
        d_begin = dtz_begin.date()
        d_end = dtz_end.date()

        # if they're the same, just pull the full log for that date
        if d_begin == d_end :
            # open that log
            logfile = self._get_logfile_date(d_begin)

            # return the full data
            return logfile.read_full()

        # otherwise, we need to pull two partial logs
        else :
            # open both of them, but it's okay if we don't have the second one
            f_begin = self._get_logfile_date(d_begin)
            f_end = self._get_logfile_date(d_end, ignore_missing=True)

            # chain together the two sources
            return itertools.chain(
                f_begin.read_from(dtz_begin),
                f_end.read_until(dtz_end) if f_end else []
            )

    def _iter_month_days (self, month) :
        """
        Iterates over the days of a month as dt objects with time=0, in the month's own timezone.
        """

        # there's at most 31 days in a month...
        for day in range(1, 32) :
            try :
                # try and build the datetime
                dt = datetime.datetime(month.year, month.month, day)

            except ValueError :
                # ran past the end of the month; the old bare except: also hid unrelated errors
                return

            else :
                # fix timezones + yield
                yield month.tzinfo.localize(dt)

    def get_month_days (self, month) :
        """
        Yields the dates for which logfiles are available in the given datetime's month.
        """

        # iterate over month's days
        for dt in self._iter_month_days(month) :
            # date in our target timezone
            log_date = dt.astimezone(self.tz).date()

            # test for the file's presence without opening it
            if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
                # valid
                yield dt.date()

    def get_modified (self, dt=None, after=None, until=None) :
        """
        Returns the contents of all logfiles with mtimes past the given date.
        """

        # iterate through all available logfiles in date order, as datetimes, within the given bounds
        for log_date in self._iter_logfile_dates(after, until) :
            # compare against dt?
            if dt :
                # stat; None if the file vanished in the meantime
                mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)

                # skip missing or unmodified files
                # (previously a None mtime was compared directly against dt)
                if mtime is None or mtime < dt :
                    continue

            # open
            logfile = self._get_logfile_date(log_date)

            # yield all lines
            for line in logfile.read_full() :
                yield line

    def get_prev_date (self, dt) :
        """
        Get the latest distinct date of logs available preceding the given date, or None.
        """

        # use for to "iter" once
        for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) :
            return log_date

        return None

    def get_next_date (self, dt) :
        """
        Get the next distinct date of logs following the given date, or None.
        """

        # use for to "iter" once
        for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) :
            return log_date

        return None
|
679 |
|