"""
    A source of IRC log files
"""

import datetime, calendar, itertools, functools, math
import os, os.path, errno
import pytz

import config, utils

# a timedelta that represents one day
ONE_DAY = datetime.timedelta(days=1)


def _iter_and_close(lines, logfiles):
    """
        Yield every item of `lines`, then close each of the given LogFiles.

        The files are also closed if the consumer abandons the generator early or an error is raised while
        reading.
    """

    try:
        for line in lines:
            yield line

    finally:
        for logfile in logfiles:
            logfile.close()


class LogSourceDecoder(object):
    """
        Handles decoding of LogSource lines
    """

    def __init__(self, encoding_list):
        """
            Will try each of the given (charset, errors) items in turn, until one succeeds
        """

        self.encoding_list = encoding_list

    def decode(self, line):
        """
            Decode the line of str() text into an unicode object.

            Each (charset, errors) pair is tried in order, and the first successful result is returned.

            Raises UnicodeDecodeError if none of them can decode the line (or if the encoding list is empty).
        """

        # list of errors encountered
        error_list = []

        # try each in turn
        for charset, errors in self.encoding_list:
            # trap UnicodeDecodeError to try with the next one
            try:
                return line.decode(charset, errors)

            except UnicodeDecodeError as e:
                error_list.append("%s:%s - %s" % (charset, errors, e))

        # failure
        # UnicodeDecodeError takes exactly (encoding, object, start, end, reason); giving it a single message
        # argument raises a TypeError instead of the intended error
        tried = ','.join(str(charset) for charset, errors in self.encoding_list) or 'unknown'

        raise UnicodeDecodeError(
            tried, line, 0, len(line),
            "Failed to decode line: %r: %s" % (line, ', '.join(error_list))
        )


class LogSource(object):
    """
        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    """

    def __init__(self, decoder, channel=None):
        """
            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
            known, then it can be given as None, and set later with bind_channel.

            Uses the given LogSourceDecoder to decode the lines.
        """

        self.channel = channel
        self.decoder = decoder

    def bind_channel(self, channel):
        """
            Set this source's channel, where None was set before
        """

        assert not self.channel

        self.channel = channel

    def get_latest(self, count):
        """
            Yield the latest events, up to `count` of them.
        """

        raise NotImplementedError

    def get_date(self, dt):
        """
            Get logs for the given date (as a datetime).
        """

        raise NotImplementedError

    def get_date_paged(self, dt, count, page=None):
        """
            Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
            portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
            then the lines for the page containing the given timestamp is returned.

            If no page is given and the timestamp lies after the last line of the day, the last page is returned.

            The return value is a (page, max, lines) tuple, where page and max are integers.
        """

        # how to act?
        if page:
            # constant skip
            skip = (page - 1) * count

        else:
            skip = None

        # go through the logs a page at a time
        this_page = 1

        # last line's timestamp
        last_ts = None

        # found it yet?
        found = False

        # count the full number of lines
        line_count = 0

        # collect lines
        lines = []

        # iterate using get_date
        for line in self.get_date(dt):
            # count them, including the ones that we skip, as max_pages needs the full total
            line_count += 1

            # skip?
            if skip:
                skip -= 1
                continue

            # is this page all that we want/need?
            if page or found:
                # already full? Keep iterating to finish counting lines
                if len(lines) >= count:
                    continue

            # specific timestamp
            else:
                # didn't find it in this page?
                if len(lines) >= count:
                    # reset to next page
                    lines = []
                    this_page += 1

                # is dt between these two timestamps?
                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp):
                    # found!
                    found = True
                    page = this_page

                else:
                    # keep looking
                    last_ts = line.timestamp

            # store line
            lines.append(line)

        # the timestamp was past the last line, so `lines` holds the last page; report its number
        if not page:
            page = this_page

        # calculate max_pages, as an integer
        max_pages = int(math.ceil(float(line_count) / count))

        # return
        return (page, max_pages, lines)

    def get_month_days(self, dt):
        """
            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
        """

        raise NotImplementedError

    def get_modified(self, dt=None, after=None, until=None):
        """
            Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.

            If the datetime is not given, *all* lines are returned.

            If after is given, only lines from said date onwards will be returned, regardless of modification.
            If until is given, only lines up to and including said date will be returned, regardless of modification.

            The LogLines should be in time order.
        """

        raise NotImplementedError

    def get_prev_date(self, dt):
        """
            Get the next distinct date of logs available preceding the given date, or None
        """

        raise NotImplementedError

    def get_next_date(self, dt):
        """
            Get the next distinct date of logs following the given date, or None.
        """

        raise NotImplementedError


class LogFile(object):
    """
        A file containing LogEvents

        The underlying file is opened on construction; call close() (or use the LogFile as a context manager) to
        release it.

        XXX: modify to implement LogSource?
    """

    def __init__(self, path, parser, decoder, channel=None, start_date=None, sep='\n'):
        """
            Open the file at the given path, which contains lines as separated by the given separator. Lines are
            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
            as the initial date for this log's first line.

            XXX: currently we assume start_date also for the end of the file
        """

        # store
        self.channel = channel
        self.path = path
        self.parser = parser
        self.start_date = start_date
        self.decoder = decoder
        self.sep = sep

        # open
        self.file = open(path, 'rb')

    def close(self):
        """
            Close the underlying file. Reading from the LogFile is not possible after this.
        """

        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __iter__(self):
        """
            Yields a series of unicode lines, as read from the top of the file
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines, decoding them as well
        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)

    def read_full(self):
        """
            Reads all LogLines. The LogLines will have a valid offset.
        """

        # just use our __iter__
        return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)

    def read_from(self, dt):
        """
            Reads all LogLines from the given timestamp onwards
        """

        # drop the leading events that are older than dt, and yield the rest as-is
        for event in itertools.dropwhile(lambda event: event.timestamp < dt, self.read_full()):
            yield event

    def read_until(self, dt):
        """
            Reads all LogLines up until the given timestamp
        """

        # yield events until we hit the given timestamp, and ignore the rest
        for event in itertools.takewhile(lambda event: event.timestamp <= dt, self.read_full()):
            yield event

    def _read_blocks_reverse(self, blocksize=1024):
        """
            Yields blocks of file data in reverse order, starting at the end of the file.

            A single trailing separator at the very end of the file is left out, so that splitting the data on the
            separator does not produce an empty line after the last real line.

            Raises IOError if the file yields less data than expected.
        """

        # seek to end of file
        self.file.seek(0, os.SEEK_END)

        # read offset
        offset = self.file.tell()

        # leave out the trailing separator, but only if it is really there
        sep_len = len(self.sep)

        if sep_len and offset >= sep_len:
            self.file.seek(offset - sep_len)

            if self.file.read(sep_len) == self.sep:
                offset -= sep_len

        # do not try to read past the beginning of the file
        while offset > 0:
            # full block, or a partial one at the beginning of the file
            read_size = min(blocksize, offset)
            offset -= read_size

            # seek to offset
            self.file.seek(offset)

            # read the data we want
            block = self.file.read(read_size)

            # sanity check
            if len(block) != read_size:
                raise IOError("Short read from %s: wanted %d bytes, got %d" % (self.path, read_size, len(block)))

            # yield
            yield block

    def _read_lines_reverse(self):
        """
            Yields decoded lines from the end of the file, in reverse order.
        """

        # an empty file has no lines at all, not one empty line
        self.file.seek(0, os.SEEK_END)
        have_data = self.file.tell() > 0

        # partial lines
        buf = ''

        # read from end of file, a block at a time
        for block in self._read_blocks_reverse():
            # add in our previous buf
            buf = block + buf

            # split up lines
            lines = buf.split(self.sep)

            # keep the first one as our buffer, as it's incomplete
            buf = lines[0]

            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
            # XXX: use something like islice, this has to build a slice object
            for line in lines[:0:-1]:
                yield self.decoder.decode(line)

        # what remains in the buffer is the first line of the file
        if have_data:
            yield self.decoder.decode(buf)

    def read_latest(self, count):
        """
            Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
        """

        # read at most count lines, newest first
        lines = list(itertools.islice(self._read_lines_reverse(), max(count, 0)))

        # decode in reverse order, using our starting date....
        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
        return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)


class LogDirectory(LogSource):
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__(self, path, tz, parser, decoder, filename_fmt, channel=None):
        """
            Load the logfiles at the given path, which are for the given LogChannel

            Decode the file lines using the given decoder, the files are named according the the date in the given
            timezone and date format, and will be parsed using the given parser.
        """

        # store channel + decoder
        LogSource.__init__(self, decoder, channel)

        # store
        self.path = path
        self.tz = tz
        self.parser = parser
        self.filename_fmt = filename_fmt

    def _get_logfile_date(self, d, load=True, mtime=False, ignore_missing=False):
        """
            Get the logfile corresponding to the given naive date in our timezone.

            If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
            then this returns the file's mtime, as given by utils.mtime.

            If the logfile does not exist, this returns None if ignore_missing is given, and otherwise lets the
            error through. Other errors are always let through.
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        try:
            if load:
                # open+return the LogFile
                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)

            elif mtime:
                # stat
                return utils.mtime(path)

            else:
                # test
                return os.path.exists(path)

        # XXX: move to LogFile
        # EnvironmentError covers both IOError (from open) and OSError (from stat)
        except EnvironmentError as e:
            # return None for missing files
            if e.errno == errno.ENOENT and ignore_missing:
                return None

            else:
                raise

    def _iter_logfile_dates(self, after=None, until=None, reverse=False):
        """
            Yields a series of datetime objects in our timezone representing the logfiles that are available, in
            time order.

            Parameters :
                after   only dates from said date onwards will be returned
                until   only dates up to and including said date will be returned
                reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
        """

        # convert timestamps to our timezone's dates
        if after:
            after = after.astimezone(self.tz).date()

        if until:
            until = until.astimezone(self.tz).date()

        # the matching logfiles
        found = []

        # listdir
        for filename in os.listdir(self.path):
            try:
                # parse date
                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))

            except ValueError:
                # not a logfile, ignore
                continue

            date = dt.date()

            if (after and date < after) or (until and date > until):
                # ignore
                continue

            found.append(dt)

        # sort by the actual date, as the filename order depends on the filename format
        found.sort(reverse=reverse)

        for dt in found:
            yield dt

    def _iter_date_reverse(self, dt=None):
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
            given *datetime*, or the the current date, if none given
        """

        if not dt:
            # default to now, converted to our timezone
            day = datetime.datetime.now(self.tz).date()

        else:
            # convert to target timezone
            day = dt.astimezone(self.tz).date()

        # iterate unto infinity
        while True:
            # yield
            yield day

            # one day sdrawkcab
            day -= ONE_DAY

    def _iter_logfile_reverse(self, dt=None, max_files=100):
        """
            Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
            current date, if none given.

            Reads/probes at most max_files files.

            Raises an Exception if not a single logfile was found among those.
        """

        # start counting at zero...
        file_count = 0

        # have we found any files at all so far?
        have_found = False

        # iterate backwards over days
        for day in self._iter_date_reverse(dt):
            # stop if we've handled enough files by now
            if file_count >= max_files:
                break

            # try and open the next logfile
            file_count += 1
            logfile = self._get_logfile_date(day, ignore_missing=True)

            # no logfile there?
            if not logfile:
                # skip to next day
                continue

            # mark have_found
            have_found = True

            # yield it
            yield logfile

        # if we didn't find any logfiles at all, terminate rudely
        if not have_found:
            raise Exception("No recent logfiles found")

    def get_latest(self, count):
        """
            Uses _iter_logfile_reverse to read the latest lines from as many logfiles as needed.

            Returns a list of at most count lines, in time order.
        """

        # read the events into here
        lines = []

        if count <= 0:
            return lines

        # start reading in those logfiles
        for logfile in self._iter_logfile_reverse():
            # read only as many events as are still missing, so that we do not return more than count
            # XXX: use a queue
            try:
                older_lines = list(logfile.read_latest(count - len(lines)))

            finally:
                logfile.close()

            lines = older_lines + lines

            # done?
            if len(lines) >= count:
                break

        # return the events
        return lines

    def get_date(self, dt):
        """
            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
            datetime differs from our native datetime, this may involve lines from more than one logfile.

            The logfile for the start of the day must exist. The logfiles are opened right away, and closed once the
            returned lines have been iterated over.
        """

        # begin/end of 24h period, in target timezone
        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)

        # as dates
        d_begin = dtz_begin.date()
        d_end = dtz_end.date()

        # if they're the same, just pull the full log for that date
        if d_begin == d_end:
            # open that log
            logfile = self._get_logfile_date(d_begin)

            # return the full data
            return _iter_and_close(logfile.read_full(), [logfile])

        # otherwise, we need to pull two partial logs
        # open both of them, but it's okay if we don't have the second one
        f_begin = self._get_logfile_date(d_begin)

        try:
            f_end = self._get_logfile_date(d_end, ignore_missing=True)

        except Exception:
            # do not leave the first one open
            f_begin.close()
            raise

        logfiles = [f_begin]
        parts = [f_begin.read_from(dtz_begin)]

        if f_end:
            logfiles.append(f_end)
            parts.append(f_end.read_until(dtz_end))

        # chain together the two sources
        return _iter_and_close(itertools.chain(*parts), logfiles)

    def _iter_month_days(self, month):
        """
            Iterates over the days of a month as dt objects with time=0, in the given datetime's timezone
        """

        # number of days in this month
        day_count = calendar.monthrange(month.year, month.month)[1]

        for day in range(1, day_count + 1):
            # build the datetime
            dt = datetime.datetime(month.year, month.month, day)

            # fix timezones + yield
            yield month.tzinfo.localize(dt)

    def get_month_days(self, month):
        """
            Yields the dates for which logfiles are available in the given datetime's month, in order
        """

        # iterate over month's days
        for dt in self._iter_month_days(month):
            # date in our target timezone
            log_date = dt.astimezone(self.tz).date()

            # test for it
            if self._get_logfile_date(log_date, load=False, ignore_missing=True):
                # valid
                yield dt.date()

    def get_modified(self, dt=None, after=None, until=None):
        """
            Yields the contents of all logfiles with mtimes past the given date, or of all logfiles, if no date is
            given. The after/until limits are applied to the dates of the logfiles.
        """

        # iterate through all available logfiles in date order, as datetimes, from the given date on
        for log_date in self._iter_logfile_dates(after, until):
            # compare against dt?
            if dt:
                # stat
                mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)

                # gone missing, or not modified?
                if mtime is None or mtime < dt:
                    # skip
                    continue

            # open
            logfile = self._get_logfile_date(log_date)

            # yield all lines
            try:
                for line in logfile.read_full():
                    yield line

            finally:
                logfile.close()

    def get_prev_date(self, dt):
        """
            Just use _iter_logfile_dates, in reverse, and take the first date that it gives us, or None
        """

        return next(self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True), None)

    def get_next_date(self, dt):
        """
            Just use _iter_logfile_dates, and take the first date that it gives us, or None
        """

        return next(self._iter_logfile_dates(after=dt + ONE_DAY), None)