diff -r aaa62c8e5bd5 -r f13cf27a360b log_source.py --- a/log_source.py Sun Feb 08 04:59:22 2009 +0200 +++ b/log_source.py Mon Feb 09 00:24:13 2009 +0200 @@ -2,13 +2,10 @@ A source of IRC log files """ -import codecs -from datetime import date, datetime, timedelta +import datetime, itertools +import os, errno import pytz -# for SEEK_*, errno -import os, errno - class LogSource (object) : """ A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events @@ -20,19 +17,30 @@ """ abstract + + def get_date (self, dt) : + """ + Get logs for the given date (as a datetime) + """ + + abstract class LogFile (LogSource) : """ A file containing LogEvents """ - def __init__ (self, path, charset='utf-8', sep='\n') : + def __init__ (self, path, parser, start_date=None, charset='utf-8', sep='\n') : """ - Open the file at the given path, which contains data of the given codec, as lines separated by the given separator + Open the file at the given path, which contains data with the given charset, as lines separated by the + given separator. Lines are parsed using the given parser, using the given date as an initial date, see + LogParser for more info. 
XXX: currently we assume start_date also for the end of the file """ # store self.path = path + self.parser = parser + self.start_date = start_date self.charset = charset self.sep = sep @@ -41,22 +49,68 @@ def __iter__ (self) : """ - Yields a series of lines, as read from the top of the file + Yields a series of unicode lines, as read from the top of the file """ # seek to beginning self.file.seek(0) - # iterate over lines - return iter(self.file) + # iterate over lines, decoding them as well + return (line.decode(self.charset) for line in self.file) - def get_latest (self, count) : + def read_full (self) : """ - Returns up to lines from the end of the file, or less, if the file doesn't contain that many lines + Reads all LogLines + """ + + # just use our __iter__ + return self.parser.parse_lines(self, self.start_date) + + def read_from (self, dt) : + """ + Reads all LogLines from the given naive timestamp onwards + """ + + # start reading at beginning + events = self.read_full() + + # skip unwanted events + for event in events : + if event.timestamp < dt : + continue + + else : + # include this line as well + yield event + break + + # yield the rest as-is + for event in events : + yield event + + def read_until (self, dt) : + """ + Reads all LogLines up until the given naive timestamp """ - # the list of lines - lines = [] + # start reading events at the beginning + events = self.read_full() + + # yield events until we hit the given timestamp + for event in events : + if event.timestamp <= dt : + yield event + + else : + break + + # ignore the rest + return + + def _read_blocks_reverse (self, blocksize=1024) : + """ + Yields blocks of file data in reverse order, starting at the end of the file + """ # seek to end of file self.file.seek(0, os.SEEK_END) @@ -64,20 +118,14 @@ # read offset # XXX: hack -1 to get rid of trailing newline size = offset = self.file.tell() - 1 - - # use this blocksize - BLOCKSIZE = 1024 - - # trailing data - buf = '' - - # read a block at a 
time, backwards - while len(lines) < count and offset > 0: + + # do not try to read past the beginning of the file + while offset > 0: # calc new offset + size - if offset > BLOCKSIZE : + if offset > blocksize : # full block - offset -= BLOCKSIZE - read_size = BLOCKSIZE + offset -= blocksize + read_size = blocksize else : # partial block @@ -88,47 +136,77 @@ self.file.seek(offset) # read the data we want - read_buf = self.file.read(read_size) - read_len = len(read_buf) + block = self.file.read(read_size) # sanity check - assert read_len == read_size + assert len(block) == read_size + # yield + yield block + + def _read_lines_reverse (self) : + """ + Yields decoded lines from the end of the file, in reverse order. + """ + + # partial lines + buf = '' + + # read from end of file, a block at a time + for block in self._read_blocks_reverse() : # add in our previous buf - buf = read_buf + buf + buf = block + buf - # split out lines - buf_lines = buf.split(self.sep) + # split up lines + lines = buf.split(self.sep) # keep the first one as our buffer, as it's incomplete - buf = buf_lines[0] + buf = lines[0] + + # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :) + # XXX: use something like islice, this has to build a slice object + for line in lines[:0:-1] : + yield line.decode(self.charset) - # prepend up to count lines from the end to our lines buffer - lines = buf_lines[-min(count, len(buf_lines) - 1):] + lines + def get_latest (self, count) : + """ + Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines. + """ + + # the list of lines + lines = [] + + # start reading lines into lines + for line in self._read_lines_reverse() : + # append + lines.append(line) + + # done? 
+ if len(lines) >= count : + break - # decode - # XXX: better queue implementation, plz - lines = [line.decode(self.charset) for line in lines] - - # return the line list - return lines + # decode in reverse order, using our starting date.... + # XXX: use lines[::-1] or reversed? + # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that + return self.parser.parse_lines(reversed(lines), self.start_date) class LogDirectory (LogSource) : """ A directory containing a series of timestamped LogFiles """ - def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') : + def __init__ (self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d') : """ Load the logfiles at the given path. The files contain data in the given charset, and are named according the the date in the given timezone and - date format. + date format, and will be parsed using the given parser. """ # store self.path = path self.tz = tz + self.parser = parser self.charset = charset self.filename_fmt = filename_fmt @@ -155,9 +233,9 @@ path = os.path.join(self.path, filename) # return the LogFile - return LogFile(path, self.charset) + return LogFile(path, self.parser, d, self.charset) - def _iter_backwards (self, dt=None) : + def _iter_date_reverse (self, dt=None) : """ Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the given *datetime*, or the the current date, if none given @@ -165,13 +243,13 @@ # default to now if not dt : - dt = datetime.now(pytz.utc) + dt = datetime.datetime.now(pytz.utc) # convert to target timezone dtz = dt.astimezone(self.tz) # our timedelta - ONE_DAY = timedelta(1) + ONE_DAY = datetime.timedelta(1) # iterate unto infinity while True : @@ -187,7 +265,7 @@ """ # iterate backwards from now - day_iter = self._iter_backwards() + day_iter = self._iter_date_reverse() # number of files read files = 0 @@ -195,7 +273,7 @@ # only read up to 100 files or so MAX_FILES = 100 - 
# read the lines into here + # read the events into here lines = [] # loop until done @@ -221,9 +299,37 @@ # skip to next day continue - # read the lines - lines = logfile.get_latest(count) + lines + # read the events + # XXX: use a queue + lines = list(logfile.get_latest(count)) + lines - # return the lines + # return the events return lines + def get_date (self, dt) : + """ + A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given datetime + differs from our native datetime, this may involve lines from more than one logfile. + """ + + # begin/end of 24h period, in target timezone + dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz) + dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz) + + # as dates + d_begin = dtz_begin.date() + d_end = dtz_end.date() + + # if they're the same, just pull the full log for that date + if d_begin == d_end : + return self._get_logfile_date(d_begin).read_full() + + # otherwise, we need to pull two partial logs + else : + # open both of them + f_begin = self._get_logfile_date(d_begin) + f_end = self._get_logfile_date(d_end) + + # chain together the two sources + return itertools.chain(f_begin.read_from(dtz_begin), f_end.read_until(dtz_end)) +