diff -r 9c7769850195 -r 6db2527b67cf qmsk/irclogs/log_source.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qmsk/irclogs/log_source.py	Sun Sep 13 01:15:56 2009 +0300
@@ -0,0 +1,679 @@
+"""
+    A source of IRC log files
+"""
+
+import datetime, calendar, itertools, functools, math
+import os, os.path, errno
+import pytz
+
+import config, utils
+
+# a timedelta that represents one day
+ONE_DAY = datetime.timedelta(days=1)
+
+class LogSourceDecoder (object) :
+    """
+        Handles decoding of LogSource lines
+    """
+
+    def __init__ (self, encoding_list) :
+        """
+            Will try each of the given (charset, errors) items in turn, until one succeeds
+        """
+
+        self.encoding_list = encoding_list
+
+    def decode (self, line) :
+        """
+            Decode the line of str() text into a unicode object
+        """
+
+        # list of errors encountered
+        error_list = []
+
+        # try each in turn
+        for charset, errors in self.encoding_list :
+            # trap UnicodeDecodeError to try with the next one
+            try :
+                return line.decode(charset, errors)
+
+            except UnicodeDecodeError, e :
+                error_list.append("%s:%s - %s" % (charset, errors, e))
+                continue
+
+        # failure; note that UnicodeDecodeError itself requires five arguments, so raise the generic UnicodeError
+        raise UnicodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
+
+class LogSource (object) :
+    """
+        A collection of IRC logs for a specific target in some format. Provides the ability to read specific events.
+    """
+
+    def __init__ (self, decoder, channel=None) :
+        """
+            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not
+            yet known, it can be given as None and set later with bind_channel.
+
+            Uses the given LogSourceDecoder to decode the lines.
+        """
+
+        self.channel = channel
+        self.decoder = decoder
+
+    def bind_channel (self, channel) :
+        """
+            Set this source's channel, where None was set before
+        """
+
+        assert not self.channel
+
+        self.channel = channel
+
+    def get_latest (self, count) :
+        """
+            Yield the latest events, up to `count` of them.
+        """
+
+        abstract
+
+    def get_date (self, dt) :
+        """
+            Get logs for the given date (as a datetime).
+        """
+
+        abstract
+
+    def get_date_paged (self, dt, count, page=None) :
+        """
+            Get the logs for a given date (as a datetime), divided into pages of count lines each. If page is given,
+            the time portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page
+            is None, the lines for the page containing the given timestamp are returned.
+
+            The return value is a (page, max, lines) tuple.
+        """
+
+        # how to act?
+        if page :
+            # constant skip
+            skip = (page - 1) * count
+
+        else :
+            skip = None
+
+        # go through the logs a page at a time
+        this_page = 1
+
+        # last line's timestamp
+        last_ts = None
+
+        # found it yet?
+        found = False
+
+        # count the full number of lines
+        line_count = 0
+
+        # collect lines
+        lines = []
+
+        # iterate using get_date
+        for line in self.get_date(dt) :
+            # count them
+            line_count += 1
+
+            # skip?
+            if skip :
+                skip -= 1
+                continue
+
+            # is this page all that we want/need?
+            if page or found :
+                # already full?
+                if len(lines) >= count :
+                    continue
+
+            # specific timestamp
+            else :
+                # didn't find it in this page?
+                if len(lines) >= count :
+                    # reset to next page
+                    lines = []
+                    this_page += 1
+
+                # is dt between these two timestamps?
+                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
+                    # found!
+                    found = True
+                    page = this_page
+
+                else :
+                    # keep looking
+                    last_ts = line.timestamp
+
+            # store line
+            lines.append(line)
+
+        # calculate max_pages as an integer page count
+        max_pages = int(math.ceil(float(line_count) / count))
+
+        # return
+        return (page, max_pages, lines)
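
The paging arithmetic in get_date_paged above is easy to check in isolation. A minimal sketch of its two modes, assuming plain timestamps in place of real LogLines (the helper names are hypothetical, not part of this diff):

    import math

    def skip_for_page (page, count) :
        # mode 1: an explicit page number maps to a constant number of lines to skip
        return (page - 1) * count

    def page_of_timestamp (timestamps, dt, count) :
        # mode 2: find which page holds the first line at/after dt
        for index, ts in enumerate(timestamps) :
            if ts >= dt :
                return index // count + 1
        return None

    # e.g. 45 lines at 20 per page -> 3 pages
    assert int(math.ceil(45 / 20.0)) == 3
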
+    def get_month_days (self, dt) :
+        """
+            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs
+            available.
+        """
+
+        abstract
+
+    def get_modified (self, dt=None, after=None, until=None) :
+        """
+            Returns a sequence of LogLines that may have been *modified* from their old values since the given
+            datetime.
+
+            If the datetime is not given, *all* lines are returned.
+
+            If after is given, only lines from said date onwards will be returned, regardless of modification.
+            If until is given, only lines up to and including said date will be returned, regardless of modification.
+
+            The LogLines should be in time order.
+        """
+
+        abstract
+
+    def get_prev_date (self, dt) :
+        """
+            Get the latest distinct date of logs available preceding the given date, or None
+        """
+
+        abstract
+
+    def get_next_date (self, dt) :
+        """
+            Get the next distinct date of logs following the given date, or None.
+        """
+
+        abstract
+
+class LogFile (object) :
+    """
+        A file containing LogEvents
+
+        XXX: modify to implement LogSource?
+    """
+
+    def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
+        """
+            Open the file at the given path, which contains lines separated by the given separator. Lines are
+            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
+            as the initial date for this log's first line.
+
+            XXX: currently we assume start_date also for the end of the file
+        """
+
+        # store
+        self.channel = channel
+        self.path = path
+        self.parser = parser
+        self.start_date = start_date
+        self.decoder = decoder
+        self.sep = sep
+
+        # open
+        self.file = open(path, 'rb')
+
+    def __iter__ (self) :
+        """
+            Yields a series of unicode lines, as read from the top of the file
+        """
+
+        # seek to beginning
+        self.file.seek(0)
+
+        # iterate over lines, decoding them as well
+        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
+
+    def read_full (self) :
+        """
+            Reads all LogLines. The LogLines will have a valid offset.
+        """
+
+        # just use our __iter__
+        return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)
+
+    def read_from (self, dt) :
+        """
+            Reads all LogLines from the given timestamp onwards
+        """
+
+        # start reading at the beginning
+        events = self.read_full()
+
+        # skip unwanted events
+        for event in events :
+            if event.timestamp < dt :
+                continue
+
+            else :
+                # include this line as well
+                yield event
+                break
+
+        # yield the rest as-is
+        for event in events :
+            yield event
+
+    def read_until (self, dt) :
+        """
+            Reads all LogLines up until the given timestamp
+        """
+
+        # start reading events at the beginning
+        events = self.read_full()
+
+        # yield events until we hit the given timestamp
+        for event in events :
+            if event.timestamp <= dt :
+                yield event
+
+            else :
+                break
+
+        # ignore the rest
+        return
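
Since read_from stops skipping at the first event at/after its timestamp, and read_until stops yielding at the first event past its timestamp, a bounded window over a single file can be built in one pass. A sketch, assuming events arrive in time order with a .timestamp attribute (the helper name is illustrative):

    import itertools

    def read_window (logfile, dt_begin, dt_end) :
        # events with dt_begin <= event.timestamp <= dt_end, one pass over the file
        return itertools.takewhile(lambda event : event.timestamp <= dt_end,
                logfile.read_from(dt_begin))
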
+    def _read_blocks_reverse (self, blocksize=1024) :
+        """
+            Yields blocks of file data in reverse order, starting at the end of the file
+        """
+
+        # seek to end of file
+        self.file.seek(0, os.SEEK_END)
+
+        # read offset
+        # XXX: hack -1 to get rid of trailing newline
+        offset = self.file.tell() - 1
+
+        # do not try to read past the beginning of the file
+        while offset > 0 :
+            # calc new offset + size
+            if offset > blocksize :
+                # full block
+                offset -= blocksize
+                read_size = blocksize
+
+            else :
+                # partial block
+                read_size = offset
+                offset = 0
+
+            # seek to offset
+            self.file.seek(offset)
+
+            # read the data we want
+            block = self.file.read(read_size)
+
+            # sanity check
+            assert len(block) == read_size
+
+            # yield
+            yield block
+
+    def _read_lines_reverse (self) :
+        """
+            Yields decoded lines from the end of the file, in reverse order.
+        """
+
+        # partial lines
+        buf = ''
+
+        # read from end of file, a block at a time
+        for block in self._read_blocks_reverse() :
+            # add in our previous buf
+            buf = block + buf
+
+            # split up lines
+            lines = buf.split(self.sep)
+
+            # keep the first one as our buffer, as it's incomplete
+            buf = lines[0]
+
+            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
+            # XXX: use something like islice, this has to build a slice object
+            for line in lines[:0:-1] :
+                yield self.decoder.decode(line)
+
+        # whatever remains in buf is the first line of the file; don't lose it
+        if buf :
+            yield self.decoder.decode(buf)
+
+    def read_latest (self, count) :
+        """
+            Returns up to count events from the end of the file, or fewer if the file doesn't contain that many
+            lines.
+        """
+
+        # the list of lines
+        lines = []
+
+        # start reading lines into lines
+        for line in self._read_lines_reverse() :
+            # append
+            lines.append(line)
+
+            # done?
+            if len(lines) >= count :
+                break
+
+        # parse in forwards order, using our starting date....
+        # XXX: use lines[::-1] or reversed?
+        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
+        return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
+
+class LogDirectory (LogSource) :
+    """
+        A directory containing a series of timestamped LogFiles
+    """
+
+    def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
+        """
+            Load the logfiles at the given path, which are for the given LogChannel.
+
+            The file lines are decoded using the given decoder; the files are named according to the date in the
+            given timezone and date format, and will be parsed using the given parser.
+        """
+
+        # store
+        self.channel = channel
+        self.path = path
+        self.tz = tz
+        self.parser = parser
+        self.decoder = decoder
+        self.filename_fmt = filename_fmt
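
The filename_fmt is used in both directions: strftime to build a filename from a date, and strptime to recover the date when listing the directory. A standalone check of that round-trip (the format string here is a hypothetical example, not mandated by this diff):

    import datetime

    filename_fmt = '%Y-%m-%d.log'

    d = datetime.date(2009, 9, 13)
    filename = d.strftime(filename_fmt)                                  # '2009-09-13.log'
    parsed = datetime.datetime.strptime(filename, filename_fmt).date()   # back to 2009-09-13

    assert parsed == d
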
+    def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
+        """
+            Get the logfile corresponding to the given naive date in our timezone.
+
+            If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
+            then this returns the file's mtime instead.
+
+            If the logfile does not exist, returns None when ignore_missing is given; otherwise, the error
+            propagates.
+        """
+
+        # format filename
+        filename = d.strftime(self.filename_fmt)
+
+        # build path
+        path = os.path.join(self.path, filename)
+
+        try :
+            if load :
+                # open+return the LogFile
+                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)
+
+            elif mtime :
+                # stat
+                return utils.mtime(path)
+
+            else :
+                # test
+                return os.path.exists(path)
+
+        # XXX: move to LogFile
+        except IOError, e :
+            # return None for missing files
+            if e.errno == errno.ENOENT and ignore_missing :
+                return None
+
+            else :
+                raise
+
+    def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
+        """
+            Yields a series of timezone-aware datetime objects representing the logfiles that are available, in
+            time order.
+
+            Parameters:
+                after       only dates from said date onwards will be returned
+                until       only dates up to and including said date will be returned
+                reverse     the dates are returned in reverse order instead. Note that the meaning of after/until
+                            doesn't change
+        """
+
+        # convert timestamps to our timezone's dates
+        if after :
+            after = after.astimezone(self.tz).date()
+
+        if until :
+            until = until.astimezone(self.tz).date()
+
+        # listdir
+        filenames = os.listdir(self.path)
+
+        # sort
+        filenames.sort(reverse=reverse)
+
+        # iter files
+        for filename in filenames :
+            try :
+                # parse date
+                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
+                date = dt.date()
+
+            except ValueError :
+                # not a logfile name, ignore
+                continue
+
+            else :
+                if (after and date < after) or (until and date > until) :
+                    # out of range, ignore
+                    continue
+
+                else :
+                    # yield
+                    yield dt
+
+    def _iter_date_reverse (self, dt=None) :
+        """
+            Yields an infinite series of naive date objects in our timezone, iterating backwards in time, starting
+            at the given *datetime*, or the current date if none is given
+        """
+
+        # default to now
+        if not dt :
+            dtz = self.tz.localize(datetime.datetime.now())
+
+        else :
+            # convert to target timezone
+            dtz = dt.astimezone(self.tz)
+
+        # iterate unto infinity
+        while True :
+            # yield
+            yield dtz.date()
+
+            # one day sdrawkcab
+            dtz -= ONE_DAY
+
+    def _iter_logfile_reverse (self, dt=None, max_files=100) :
+        """
+            Yields a series of LogFile objects, iterating backwards in time, starting at the given datetime, or the
+            current date if none is given.
+
+            Reads/probes at most max_files files.
+        """
+
+        # start counting at zero...
+        file_count = 0
+
+        # have we found any files at all so far?
+        have_found = False
+
+        # iterate backwards over days
+        for day in self._iter_date_reverse(dt) :
+            # stop if we've handled enough files by now
+            if file_count > max_files :
+                break
+
+            # try and open the next logfile
+            file_count += 1
+            logfile = self._get_logfile_date(day, ignore_missing=True)
+
+            # no logfile there?
+            if not logfile :
+                # hit our limit?
+                if file_count > max_files :
+                    # if we didn't find any logfiles at all, terminate rudely
+                    if not have_found :
+                        raise Exception("No recent logfiles found")
+
+                    else :
+                        # stop looking, deal with what we've got
+                        return
+
+                else :
+                    # skip to the next day
+                    continue
+
+            # mark have_found
+            have_found = True
+
+            # yield it
+            yield logfile
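
The pattern behind _iter_date_reverse and _iter_logfile_reverse is an infinite generator capped by the consumer; the same idea can be expressed with itertools. A sketch, independent of this module:

    import datetime, itertools

    def dates_reverse (start) :
        # infinite generator of dates, newest first
        while True :
            yield start
            start -= datetime.timedelta(days=1)

    # probe at most 7 days back from a given date
    week = list(itertools.islice(dates_reverse(datetime.date(2009, 9, 13)), 7))
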
+    def get_latest (self, count) :
+        """
+            Uses _iter_logfile_reverse to read the given number of lines from as many logfiles as needed
+        """
+
+        # read the events into here
+        lines = []
+
+        # start reading in those logfiles
+        for logfile in self._iter_logfile_reverse() :
+            # read the events
+            # XXX: use a queue
+            lines = list(logfile.read_latest(count)) + lines
+
+            # done?
+            if len(lines) >= count :
+                break
+
+        # return the events
+        return lines
+
+    def get_date (self, dt) :
+        """
+            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
+            datetime differs from our native timezone, this may involve lines from more than one logfile.
+        """
+
+        # begin/end of the 24h period, in the target timezone
+        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
+        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
+
+        # as dates
+        d_begin = dtz_begin.date()
+        d_end = dtz_end.date()
+
+        # if they're the same, just pull the full log for that date
+        if d_begin == d_end :
+            # open that log
+            logfile = self._get_logfile_date(d_begin)
+
+            # return the full data
+            return logfile.read_full()
+
+        # otherwise, we need to pull two partial logs
+        else :
+            # open both of them, but it's okay if we don't have the second one
+            f_begin = self._get_logfile_date(d_begin)
+            f_end = self._get_logfile_date(d_end, ignore_missing=True)
+
+            # chain together the two sources
+            return itertools.chain(
+                f_begin.read_from(dtz_begin),
+                f_end.read_until(dtz_end) if f_end else []
+            )
+
+    def _iter_month_days (self, month) :
+        """
+            Iterates over the days of a month as dt objects with time=0
+        """
+
+        # there are at most 31 days in a month...
+        for day in xrange(1, 32) :
+            try :
+                # try and build the datetime
+                dt = datetime.datetime(month.year, month.month, day)
+
+            except ValueError :
+                # no such day in this month, stop
+                return
+
+            else :
+                # localize into the month's timezone
+                yield month.tzinfo.localize(dt)
+
+    def get_month_days (self, month) :
+        """
+            Yields the dates for which logfiles are available in the given datetime's month
+        """
+
+        # iterate over the month's days
+        for dt in self._iter_month_days(month) :
+            # date in our target timezone
+            log_date = dt.astimezone(self.tz).date()
+
+            # test for it
+            if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
+                # valid
+                yield dt.date()
+
+    def get_modified (self, dt=None, after=None, until=None) :
+        """
+            Returns the contents of all logfiles with mtimes past the given date
+        """
+
+        # iterate through all available logfiles in date order, as datetimes, from the given date on
+        for log_date in self._iter_logfile_dates(after, until) :
+            # compare against dt?
+            if dt :
+                # stat
+                mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)
+
+                # missing or not modified?
+                if mtime is None or mtime < dt :
+                    # skip
+                    continue
+
+            # open
+            logfile = self._get_logfile_date(log_date)
+
+            # yield all lines
+            for line in logfile.read_full() :
+                yield line
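
utils.mtime is defined elsewhere in this repository; for reference, a plausible reading of it (an assumption, not the actual qmsk.irclogs implementation) is a naive-datetime wrapper around os.stat:

    import datetime, os

    def mtime (path) :
        # naive datetime from the filesystem's modification timestamp
        return datetime.datetime.fromtimestamp(os.stat(path).st_mtime)
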
+    def get_prev_date (self, dt) :
+        """
+            Just use _iter_logfile_dates
+        """
+
+        # use for to "iter" once
+        for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) :
+            return log_date
+
+        else :
+            return None
+
+    def get_next_date (self, dt) :
+        """
+            Just use _iter_logfile_dates
+        """
+
+        # use for to "iter" once
+        for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) :
+            return log_date
+
+        else :
+            return None
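
Putting it together, a hedged end-to-end sketch of how a caller might drive this module. The path, timezone, filename format, and StubParser below are illustrative assumptions; only LogSourceDecoder and LogDirectory come from this diff, and the real parser (which returns LogLine objects carrying .timestamp and .offset) lives elsewhere in qmsk.irclogs:

    import datetime, pytz

    from qmsk.irclogs.log_source import LogSourceDecoder, LogDirectory

    class StubParser (object) :
        # stand-in for the real log parser
        def parse_lines (self, channel, lines, date, starting_offset=None) :
            return iter(lines)

    tz = pytz.timezone('Europe/Helsinki')

    # try strict UTF-8 first, then fall back to permissive latin-1
    decoder = LogSourceDecoder([('utf-8', 'strict'), ('latin-1', 'replace')])

    logs = LogDirectory('/var/log/irc/channel', tz, StubParser(), decoder, '%Y-%m-%d.log')

    # fifty most recent lines, and everything logged on a given local day
    latest = logs.get_latest(50)
    day = logs.get_date(tz.localize(datetime.datetime(2009, 9, 13)))
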