"""
    A source of IRC log files
"""

import codecs
from datetime import date, datetime, timedelta
import pytz

# for SEEK_*, errno
import os
import errno


class LogSource(object):
    """
        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    """

    def get_latest(self, count):
        """
            Yield the latest events, up to `count` of them.

            Subclasses must override this; the base implementation always raises NotImplementedError.
        """

        raise NotImplementedError("%s.get_latest" % type(self).__name__)


class LogFile(LogSource):
    """
        A file containing LogEvents
    """

    # number of bytes read per step when scanning backwards from the end of the file
    BLOCKSIZE = 1024

    def __init__(self, path, charset='utf-8', sep='\n'):
        """
            Open the file at the given path, which contains data of the given codec, as lines separated by the given
            separator.

            The file is opened immediately, so a missing file raises IOError (errno.ENOENT) from the constructor.
        """

        # store
        self.path = path
        self.charset = charset
        self.sep = sep

        # open
        self.file = codecs.open(path, 'r', charset)

    def close(self):
        """
            Close the underlying file handle.
        """

        self.file.close()

    def __iter__(self):
        """
            Yields a series of lines, as read from the top of the file
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines
        return iter(self.file)

    def get_latest(self, count):
        """
            Returns up to `count` lines from the end of the file, or less, if the file doesn't contain that many lines.

            The lines are returned as a list of decoded strings, oldest first, without the separator. A single
            trailing separator at the end of the file does not produce an extra empty line.

            The file is scanned backwards as raw bytes and only complete lines are decoded, so a block boundary
            falling inside a multi-byte character is harmless. This assumes that the encoded separator never occurs
            inside the encoding of another character, which holds for UTF-8 and ASCII-compatible single-byte charsets.
        """

        if count <= 0:
            return []

        # the separator, as it appears in the raw file data
        sep = self.sep.encode(self.charset)

        # blocks read so far, last block of the file first
        blocks = []

        # number of separators seen in those blocks
        seps_seen = 0

        # use a separate binary handle, so that byte offsets are exact and the decoding reader's state is untouched
        with open(self.path, 'rb') as raw:
            # start at end of file
            raw.seek(0, os.SEEK_END)
            offset = raw.tell()

            # read a block at a time, backwards, until we hit the start of the file or have enough separators;
            # count + 1 separators guarantee count complete lines even if the file ends with a separator
            while offset > 0 and seps_seen <= count:
                # the block nearest the start of the file may be short
                size = min(self.BLOCKSIZE, offset)
                offset -= size

                raw.seek(offset)
                block = raw.read(size)

                # a multi-byte separator split across two blocks is not counted here, which only means we may read
                # one block more than strictly needed
                seps_seen += block.count(sep)
                blocks.append(block)

        data = b''.join(reversed(blocks))

        # empty file
        if not data:
            return []

        # the final line is usually terminated by a separator; don't treat that as an additional empty line
        if data.endswith(sep):
            data = data[:-len(sep)]

        pieces = data.split(sep)

        # unless we read all the way to the start of the file, the first piece is an incomplete line
        if offset > 0:
            pieces = pieces[1:]

        # keep the newest lines, and decode them
        return [piece.decode(self.charset) for piece in pieces[-count:]]


class LogDirectory(LogSource):
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__(self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d'):
        """
            Load the logfiles at the given path.

            The files contain data in the given charset, and are named according to the date in the given timezone and
            date format.
        """

        # store
        self.path = path
        self.tz = tz
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime(self, dt):
        """
            Get the logfile corresponding to the given datetime
        """

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # convert to date and use that
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date(self, d):
        """
            Get the logfile corresponding to the given naive date in our timezone.

            Raises IOError if the file cannot be opened.
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        # return the LogFile
        return LogFile(path, self.charset)

    def _iter_backwards(self, dt=None):
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at
            the given *datetime*, or the current date, if none given
        """

        # default to now
        if dt is None:
            dt = datetime.now(pytz.utc)

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # our timedelta
        ONE_DAY = timedelta(1)

        # iterate unto infinity
        while True:
            # yield
            yield dtz.date()

            # one day backwards
            dtz -= ONE_DAY

    def get_latest(self, count):
        """
            Uses _iter_backwards + _get_logfile_date to yield up to `count` lines from as many logfiles as needed.

            Files are visited newest first; the lines taken from each individual file are yielded oldest first.

            Days without a logfile are skipped. Once more than MAX_FILES days have been tried and another missing day
            is hit, the search stops: quietly if at least one logfile was found, or by raising an Exception if none
            was found at all. Any IOError other than ENOENT is re-raised.
        """

        # iterate backwards from now
        day_iter = self._iter_backwards()

        # number of days tried
        files = 0

        # whether any logfile has been opened so far
        found = False

        # only try up to 100 days or so
        MAX_FILES = 100

        # loop until done
        while count > 0:
            files += 1

            try:
                # open the next logfile
                logfile = self._get_logfile_date(next(day_iter))

            except IOError as e:
                # only nonexistent days are skipped
                if e.errno != errno.ENOENT:
                    raise

                if files > MAX_FILES:
                    # ran out of history; that is only an error if we never found anything
                    if found:
                        return

                    raise Exception("No recent logfiles found")

                # skip to next day
                continue

            found = True

            # read the lines we still need, and release the file handle right away
            try:
                lines = logfile.get_latest(count)

            finally:
                logfile.close()

            # yield lines
            for line in lines:
                # yield while we still need to, otherwise, stop
                if count <= 0:
                    break

                count -= 1

                yield line