diff -r 9c7769850195 -r 6db2527b67cf log_source.py
--- a/log_source.py	Sun Sep 13 00:49:55 2009 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,679 +0,0 @@
-"""
-    A source of IRC log files
-"""
-
-import datetime, calendar, itertools, functools, math
-import os, os.path, errno
-import pytz
-
-import config, utils
-
-# a timedelta that represents one day
-ONE_DAY = datetime.timedelta(days=1)
-
-class LogSourceDecoder (object) :
-    """
-        Handles decoding of LogSource lines
-    """
-
-    def __init__ (self, encoding_list) :
-        """
-            Will try each of the given (charset, errors) items in turn, until one succeeds
-        """
-
-        self.encoding_list = encoding_list
-
-    def decode (self, line) :
-        """
-            Decode the line of str() text into a unicode object
-        """
-
-        # list of errors encountered
-        error_list = []
-
-        # try each in turn
-        for charset, errors in self.encoding_list :
-            # trap UnicodeDecodeError to try with the next one
-            try :
-                return line.decode(charset, errors)
-
-            except UnicodeDecodeError, e :
-                error_list.append("%s:%s - %s" % (charset, errors, e))
-                continue
-
-        # failure; UnicodeDecodeError itself requires five arguments, so raise the generic UnicodeError instead
-        raise UnicodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
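
The fallback decoding above can be exercised on its own; a minimal sketch, assuming Python 2 as in this module, with a hypothetical encoding list (not necessarily what config defines):

    # decode a latin-1 byte string by falling back past a strict utf-8 attempt
    decoder = LogSourceDecoder([('utf-8', 'strict'), ('latin-1', 'replace')])

    line = '<spam> caf\xe9'                 # not valid utf-8, fine as latin-1
    print repr(decoder.decode(line))        # -> u'<spam> caf\xe9'
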
-class LogSource (object) :
-    """
-        A collection of IRC logs for a specific target in some format. Provides methods for reading specific events.
-    """
-
-    def __init__ (self, decoder, channel=None) :
-        """
-            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not
-            yet known, then it can be given as None, and set later with bind_channel.
-
-            Uses the given LogSourceDecoder to decode the lines.
-        """
-
-        self.channel = channel
-        self.decoder = decoder
-
-    def bind_channel (self, channel) :
-        """
-            Set this source's channel, where None was set before
-        """
-
-        assert not self.channel
-
-        self.channel = channel
-
-    def get_latest (self, count) :
-        """
-            Yield the latest events, up to `count` of them.
-        """
-
-        abstract
-
-    def get_date (self, dt) :
-        """
-            Get logs for the given date (as a datetime).
-        """
-
-        abstract
-
-    def get_date_paged (self, dt, count, page=None) :
-        """
-            Get the logs for a given date (as a datetime), divided into pages of count lines each. If page is given,
-            the time portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page
-            is None, the lines for the page containing the given timestamp are returned.
-
-            The return value is a (page, max_pages, lines) tuple.
-        """
-
-        # how to act?
-        if page :
-            # constant skip
-            skip = (page - 1) * count
-
-        else :
-            skip = None
-
-        # go through the logs a page at a time
-        this_page = 1
-
-        # last line's timestamp
-        last_ts = None
-
-        # found it yet?
-        found = False
-
-        # count the full number of lines
-        line_count = 0
-
-        # collect lines
-        lines = []
-
-        # iterate using get_date
-        for line in self.get_date(dt) :
-            # count them
-            line_count += 1
-
-            # skip?
-            if skip :
-                skip -= 1
-                continue
-
-            # is this page all that we want/need?
-            if page or found :
-                # already full?
-                if len(lines) >= count :
-                    continue
-
-            # specific timestamp
-            else :
-                # didn't find it in this page?
-                if len(lines) >= count :
-                    # reset to next page
-                    lines = []
-                    this_page += 1
-
-                # is dt between these two timestamps?
-                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
-                    # found!
-                    found = True
-                    page = this_page
-
-                else :
-                    # keep looking
-                    last_ts = line.timestamp
-
-            # store line
-            lines.append(line)
-
-        # calculate max_pages (math.ceil returns a float)
-        max_pages = int(math.ceil(float(line_count) / count))
-
-        # return
-        return (page, max_pages, lines)
-
-    def get_month_days (self, dt) :
-        """
-            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs
-            available.
-        """
-
-        abstract
-
-    def get_modified (self, dt=None, after=None, until=None) :
-        """
-            Returns a sequence of LogLines that may have been *modified* from their old values since the given
-            datetime.
-
-            If the datetime is not given, *all* lines are returned.
-
-            If after is given, only lines from said date onwards will be returned, regardless of modification.
-            If until is given, only lines up to and including said date will be returned, regardless of modification.
-
-            The LogLines should be in time order.
-        """
-
-        abstract
-
-    def get_prev_date (self, dt) :
-        """
-            Get the closest distinct date of logs available preceding the given date, or None.
-        """
-
-        abstract
-
-    def get_next_date (self, dt) :
-        """
-            Get the next distinct date of logs following the given date, or None.
-        """
-
-        abstract
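
The paging arithmetic in get_date_paged is an ordinary skip/ceil scheme; the same idea over a plain list, as a standalone hypothetical helper:

    import math

    def paginate (items, count, page) :
        """ Return (page, max_pages, page-items), 1-based, as get_date_paged does for log lines """

        max_pages = int(math.ceil(float(len(items)) / count))

        # constant skip, as in the `if page :` branch above
        skip = (page - 1) * count

        return (page, max_pages, items[skip:skip + count])

    # 10 items in pages of 4 -> 3 pages; page 3 holds the last two items
    print paginate(range(10), 4, 3)     # -> (3, 3, [8, 9])
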
- """ - - # just use our __iter__ - return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1) - - def read_from (self, dt) : - """ - Reads all LogLines from the given naive timestamp onwards - """ - - # start reading at beginning - events = self.read_full() - - # skip unwanted events - for event in events : - if event.timestamp < dt : - continue - - else : - # include this line as well - yield event - break - - # yield the rest as-is - for event in events : - yield event - - def read_until (self, dt) : - """ - Reads all LogLines up until the given naive timestamp - """ - - # start reading events at the beginning - events = self.read_full() - - # yield events until we hit the given timestamp - for event in events : - if event.timestamp <= dt : - yield event - - else : - break - - # ignore the rest - return - - def _read_blocks_reverse (self, blocksize=1024) : - """ - Yields blocks of file data in reverse order, starting at the end of the file - """ - - # seek to end of file - self.file.seek(0, os.SEEK_END) - - # read offset - # XXX: hack -1 to get rid of trailing newline - size = offset = self.file.tell() - 1 - - # do not try to read past the beginning of the file - while offset > 0: - # calc new offset + size - if offset > blocksize : - # full block - offset -= blocksize - read_size = blocksize - - else : - # partial block - read_size = offset - offset = 0 - - # seek to offset - self.file.seek(offset) - - # read the data we want - block = self.file.read(read_size) - - # sanity check - assert len(block) == read_size - - # yield - yield block - - def _read_lines_reverse (self) : - """ - Yields decoded lines from the end of the file, in reverse order. - """ - - # partial lines - buf = '' - - # read from end of file, a block at a time - for block in self._read_blocks_reverse() : - # add in our previous buf - buf = block + buf - - # split up lines - lines = buf.split(self.sep) - - # keep the first one as our buffer, as it's incomplete - buf = lines[0] - - # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :) - # XXX: use something like islice, this has to build a slice object - for line in lines[:0:-1] : - yield self.decoder.decode(line) - - def read_latest (self, count) : - """ - Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines. - """ - - # the list of lines - lines = [] - - # start reading lines into lines - for line in self._read_lines_reverse() : - # append - lines.append(line) - - # done? - if len(lines) >= count : - break - - # decode in reverse order, using our starting date.... - # XXX: use lines[::-1] or reversed? - # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that - return self.parser.parse_lines(self.channel, reversed(lines), self.start_date) - -class LogDirectory (LogSource) : - """ - A directory containing a series of timestamped LogFiles - """ - - def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) : - """ - Load the logfiles at the given path, which are for the given LogChannel - - Decode the file lines using the given decoder, the files are named according the the date in the given - timezone and date format, and will be parsed using the given parser. 
- """ - - # store - self.channel = channel - self.path = path - self.tz = tz - self.parser = parser - self.decoder = decoder - self.filename_fmt = filename_fmt - - def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) : - """ - Get the logfile corresponding to the given naive date in our timezone. - - If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given, - then this returns the file's mtime - - Returns None if the logfile does not exist, unless ignore_missing is given as False. - """ - - # format filename - filename = d.strftime(self.filename_fmt) - - # build path - path = os.path.join(self.path, filename) - - try : - if load : - # open+return the LogFile - return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel) - - elif mtime : - # stat - return utils.mtime(path) - - else : - # test - return os.path.exists(path) - - # XXX: move to LogFile - except IOError, e : - # return None for missing files - if e.errno == errno.ENOENT and ignore_missing : - return None - - else : - raise - - def _iter_logfile_dates (self, after=None, until=None, reverse=False) : - """ - Yields a series of naive datetime objects representing the logfiles that are available, in time order. - - Parameters : - after only dates from said date onwards will be returned - until only dates up to and including said date will be returned - reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change - """ - - # convert timestamps to our timezone's dates - if after : - after = after.astimezone(self.tz).date() - - if until : - until = until.astimezone(self.tz).date() - - # listdir - filenames = os.listdir(self.path) - - # sort - filenames.sort(reverse=reverse) - - # iter files - for filename in filenames : - try : - # parse date - dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt)) - date = dt.date() - - except : - # ignore - continue - - else : - if (after and date < after) or (until and date > until) : - # ignore - continue - - else : - # yield - yield dt - - def _iter_date_reverse (self, dt=None) : - """ - Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the - given *datetime*, or the the current date, if none given - """ - - # default to now - if not dt : - dtz = self.tz.localize(datetime.datetime.now()) - - else : - # convert to target timezone - dtz = dt.astimezone(self.tz) - - # iterate unto infinity - while True : - # yield - yield dtz.date() - - # one day sdrawkcab - dtz -= ONE_DAY - - def _iter_logfile_reverse (self, dt=None, max_files=100) : - """ - Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the - current date, if none given. - - Reads/probes at most max_files files. - """ - - # start counting at zero... - file_count = 0 - - # have we found any files at all so far? - have_found = False - - # iterate backwards over days - for day in self._iter_date_reverse(dt) : - # stop if we've handled enough files by now - if file_count > max_files : - break - - # try and open the next logfile - logfile = None - - file_count += 1 - logfile = self._get_logfile_date(day, ignore_missing=True) - - # no logfile there? - if not logfile : - # hit our limit? 
- if file_count > max_files : - # if we didn't find any logfiles at all, terminate rudely - if not have_found : - raise Exception("No recent logfiles found") - - else : - # stop looking, deal with what we've got - return - - else : - # skip to next day - continue - - # mark have_found - have_found = True - - # yield it - yield logfile - - def get_latest (self, count) : - """ - Uses _logfile_reverse to read the yield the given lines from as many logfiles as needed - """ - - # read the events into here - lines = [] - - # start reading in those logfiles - for logfile in self._iter_logfile_reverse() : - # read the events - # XXX: use a queue - lines = list(logfile.read_latest(count)) + lines - - # done? - if len(lines) >= count : - break - - # return the events - return lines - - def get_date (self, dt) : - """ - A 'day' is considered to be a 24-hour period from 00:00:00 23:59:59. If the timezone of the given datetime - differs from our native datetime, this may involve lines from more than one logfile. - """ - - # begin/end of 24h period, in target timezone - dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz) - dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz) - - # as dates - d_begin = dtz_begin.date() - d_end = dtz_end.date() - -# print -# print "LogDirectory.get_date - %s" % dt -# print "\t %s %s" % (d_begin, dtz_begin) -# print "\t-> %s %s" % (d_end, dtz_end) - - # if they're the same, just pull the full log for that date - if d_begin == d_end : - # open that log - logfile = self._get_logfile_date(d_begin) - - # return the full data - return logfile.read_full() - - # otherwise, we need to pull two partial logs - else : - # open both of them, but it's okay if we don't have the second one - f_begin = self._get_logfile_date(d_begin) - f_end = self._get_logfile_date(d_end, ignore_missing=True) - - # chain together the two sources - return itertools.chain( - f_begin.read_from(dtz_begin), - f_end.read_until(dtz_end) if f_end else [] - ) - - def _iter_month_days (self, month) : - """ - Iterates over the days of a month as dt objects with time=0 - """ - - # there's at most 31 days in a month... - for day in xrange(1, 32) : - try : - # try and build the datetime - dt = datetime.datetime(month.year, month.month, day) - - except : - # stop - return - - else : - # fix timezones + yield - yield month.tzinfo.localize(dt) - - def get_month_days (self, month) : - """ - Returns a set of dates for which logfiles are available in the given datetime's month - """ - - # iterate over month's days - for dt in self._iter_month_days(month) : - # date in our target timezone - log_date = dt.astimezone(self.tz).date() - - # test for it - if self._get_logfile_date(log_date, load=False, ignore_missing=True) : - # valid - yield dt.date() - - def get_modified (self, dt=None, after=None, until=None) : - """ - Returns the contents off all logfiles with mtimes past the given date - """ - - # iterate through all available logfiles in date order, as datetimes, from the given date on - for log_date in self._iter_logfile_dates(after, until) : - # compare against dt? - if dt : - # stat - mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True) - - # not modified? 
- if mtime < dt : - # skip - continue - - # open - logfile = self._get_logfile_date(log_date) - - # yield all lines - for line in logfile.read_full() : - yield line - - def get_prev_date (self, dt) : - """ - Just use _iter_logfile_dates - """ - - # use for to "iter" once - for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) : - return log_date - - else : - return None - - def get_next_date (self, dt) : - """ - Just use _iter_logfile_dates - """ - - # use for to "iter" once - for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) : - return log_date - - else : - return None -