"""
    A source of IRC log files
"""

import calendar
import datetime
import errno
import itertools
import os

import pytz


class LogSource(object):
    """
        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific
        events.

        This is an abstract interface: every method raises NotImplementedError until overridden by a subclass.
    """

    def get_latest(self, count):
        """
            Yield the latest events, up to `count` of them.
        """

        raise NotImplementedError("%s.get_latest" % type(self).__name__)

    def get_date(self, dt):
        """
            Get logs for the given date (as a datetime)
        """

        raise NotImplementedError("%s.get_date" % type(self).__name__)

    def get_month_days(self, dt):
        """
            Get a set of dates, telling which days in the given month (as a datetime) have logs available
        """

        raise NotImplementedError("%s.get_month_days" % type(self).__name__)


class LogFile(object):
    """
        A file containing LogEvents

        The underlying file is opened in binary mode by the constructor and stays open until close() is called;
        the object can also be used as a context manager.

        XXX: modify to implement LogSource?
    """

    def __init__(self, path, parser, start_date=None, charset='utf-8', sep='\n'):
        """
            Open the file at the given path, which contains data with the given charset, as lines separated by the
            given separator. Lines are parsed using the given parser (an object providing
            parse_lines(lines, date, ...)), using the given date as an initial date, see LogParser for more info.

            Raises IOError if the file cannot be opened.

            XXX: currently we assume start_date also for the end of the file
        """

        # store
        self.path = path
        self.parser = parser
        self.start_date = start_date
        self.charset = charset
        self.sep = sep

        # open; binary mode, as we decode the lines ourselves
        self.file = open(path, 'rb')

    def close(self):
        """
            Close the underlying file. Reading from the LogFile is not possible after this.
        """

        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _sep_bytes(self):
        """
            Returns the line separator as a byte string, suitable for splitting the raw file data
        """

        sep = self.sep

        if isinstance(sep, bytes):
            return sep

        return sep.encode(self.charset)

    def __iter__(self):
        """
            Yields a series of unicode lines, as read from the top of the file
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines, decoding them as well
        return (line.decode(self.charset).rstrip(self.sep) for line in self.file)

    def read_full(self):
        """
            Reads all LogLines. The LogLines will have a valid offset

            Returns whatever the parser's parse_lines returns for the lines of the whole file.
        """

        # just use our __iter__
        return self.parser.parse_lines(self, self.start_date, starting_offset=1)

    def read_from(self, dt):
        """
            Reads all LogLines from the given timestamp onwards.

            The first event whose timestamp is not earlier than dt is included, as are all events following it.
            dt must be comparable with the events' timestamp attribute.
        """

        # dropwhile works on a single iterator, so this is correct even if the parser hands back a list
        events = itertools.dropwhile(lambda event: event.timestamp < dt, self.read_full())

        for event in events:
            yield event

    def read_until(self, dt):
        """
            Reads all LogLines up until the given timestamp (inclusive).

            Stops at the first event whose timestamp is later than dt; the rest of the file is ignored.
            dt must be comparable with the events' timestamp attribute.
        """

        events = itertools.takewhile(lambda event: event.timestamp <= dt, self.read_full())

        for event in events:
            yield event

    def _read_blocks_reverse(self, blocksize=1024):
        """
            Yields blocks of raw file data in reverse order, starting at the end of the file.

            A single trailing line separator at the very end of the file is not included in the data, so that the
            last line is not followed by a bogus empty line. Every block is `blocksize` bytes long, except for the
            final one (the start of the file), which may be shorter.

            Raises IOError if the file yields less data than its size promised.
        """

        sep = self._sep_bytes()

        # seek to end of file
        self.file.seek(0, os.SEEK_END)
        offset = self.file.tell()

        # only strip the trailing separator if it is actually there
        if sep and offset >= len(sep):
            self.file.seek(offset - len(sep))

            if self.file.read(len(sep)) == sep:
                offset -= len(sep)

        # do not try to read past the beginning of the file
        while offset > 0:
            # full block, or whatever is left at the start of the file
            read_size = min(blocksize, offset)
            offset -= read_size

            # read the data we want
            self.file.seek(offset)
            block = self.file.read(read_size)

            # sanity check
            if len(block) != read_size:
                raise IOError("Short read from %r: wanted %d bytes at offset %d, got %d" % (
                    self.path, read_size, offset, len(block)
                ))

            yield block

    def _read_lines_reverse(self):
        """
            Yields decoded lines from the end of the file, in reverse order, all the way up to and including the
            first line of the file.
        """

        sep = self._sep_bytes()

        # partial line at the start of the data read so far
        buf = b''
        got_data = False

        # read from end of file, a block at a time
        for block in self._read_blocks_reverse():
            got_data = True

            # add in our previous buf
            buf = block + buf

            # split up lines
            lines = buf.split(sep)

            # keep the first one as our buffer, as it may be incomplete
            buf = lines[0]

            # yield the rest a line at a time in reverse order
            for line in reversed(lines[1:]):
                yield line.decode(self.charset)

        # once we've hit the start of the file, the buffer holds the complete first line
        if got_data:
            yield buf.decode(self.charset)

    def read_latest(self, count):
        """
            Returns up to count events, from the end of the file, or less, if the file doesn't contain that many
            lines. A count of zero or less gives no events.
        """

        # the last `count` lines, newest first
        lines = list(itertools.islice(self._read_lines_reverse(), max(count, 0)))

        # parse in chronological order, using our starting date....
        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
        return self.parser.parse_lines(reversed(lines), self.start_date)


class LogDirectory(LogSource):
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__(self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d'):
        """
            Load the logfiles at the given path.

            The files contain data in the given charset, and are named according the the date in the given timezone
            and date format, and will be parsed using the given parser.
        """

        # store
        self.path = path
        self.tz = tz
        self.parser = parser
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime(self, dt):
        """
            Get the logfile corresponding to the given (timezone-aware) datetime
        """

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # convert to date and use that
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date(self, d, load=True):
        """
            Get the logfile corresponding to the given naive date in our timezone. If load is False, only test for
            the presence of the logfile, do not actually open it.

            With load=True, returns the opened LogFile, or None if the logfile does not exist; any other error from
            opening the file is raised as-is. With load=False, returns the boolean result of os.path.exists.
        """

        # format filename and build path
        path = os.path.join(self.path, d.strftime(self.filename_fmt))

        if not load:
            # test only
            return os.path.exists(path)

        # XXX: move to LogFile
        try:
            return LogFile(path, self.parser, d, self.charset)

        except IOError as e:
            # return None for missing files
            if e.errno == errno.ENOENT:
                return None

            raise

    def _iter_date_reverse(self, dt=None):
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at
            the given *datetime*, or the the current date, if none given
        """

        # default to now
        if dt is None:
            dt = datetime.datetime.now(pytz.utc)

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        one_day = datetime.timedelta(1)

        # iterate unto infinity
        while True:
            yield dtz.date()

            # one day backwards
            dtz -= one_day

    def _iter_logfile_reverse(self, dt=None, max_files=100):
        """
            Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
            current date, if none given. Days without a logfile are skipped.

            Reads/probes at most max_files days. Raises Exception if not a single logfile was found among them.
        """

        found = False

        # iterate backwards over a bounded number of days
        for day in itertools.islice(self._iter_date_reverse(dt), max_files):
            logfile = self._get_logfile_date(day)

            # no logfile there? skip to the next day
            if logfile is None:
                continue

            found = True
            yield logfile

        # if we didn't find any logfiles at all, terminate rudely
        if not found:
            raise Exception("No recent logfiles found")

    def get_latest(self, count):
        """
            Uses _iter_logfile_reverse to read the latest events from as many logfiles as needed.

            Returns a list of at most `count` events in chronological order; fewer are returned if the recent
            logfiles do not contain that many. Raises Exception if no recent logfiles exist at all.
        """

        # the events read so far, oldest first
        lines = []

        if count <= 0:
            return lines

        for logfile in self._iter_logfile_reverse():
            # read only as many events as are still missing, and release the file once they are in memory
            # XXX: use a queue
            try:
                events = list(logfile.read_latest(count - len(lines)))

            finally:
                logfile.close()

            # older events go in front
            lines = events + lines

            if len(lines) >= count:
                break

        return lines

    def get_date(self, dt):
        """
            A 'day' is considered to be a 24-hour period from 00:00:00 23:59:59. If the timezone of the given
            datetime differs from our native datetime, this may involve lines from more than one logfile.

            Returns an iterable of events. Raises Exception if there is no logfile at all covering the day.
        """

        # begin/end of 24h period, in target timezone
        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)

        # as dates
        d_begin = dtz_begin.date()
        d_end = dtz_end.date()

        # if they're the same, just pull the full log for that date
        if d_begin == d_end:
            logfile = self._get_logfile_date(d_begin)

            if logfile is None:
                raise Exception("No logfile for date=%r" % (dt, ))

            # return the full data
            return logfile.read_full()

        # otherwise, we need to pull two partial logs, either of which may be missing
        f_begin = self._get_logfile_date(d_begin)
        f_end = self._get_logfile_date(d_end)

        if f_begin is None and f_end is None:
            raise Exception("No logfile for date=%r" % (dt, ))

        # chain together the two sources
        return itertools.chain(
            f_begin.read_from(dtz_begin) if f_begin else [],
            f_end.read_until(dtz_end) if f_end else []
        )

    def get_month_days(self, month):
        """
            Returns a set of dates for which logfiles are available in the given datetime's month.

            The dates tested are those given by calendar.Calendar().itermonthdates, which covers complete weeks,
            so days of the adjacent months that share a week with this month may be included as well.

            The given datetime's tzinfo must provide localize(), as pytz timezones do.
        """

        # the set of days
        days = set()

        # iterate over month's days using Calendar
        for date in calendar.Calendar().itermonthdates(month.year, month.month):
            # midnight of that date in the caller's timezone, converted to our target timezone
            midnight = datetime.datetime.combine(date, datetime.time(0))
            dtz = month.tzinfo.localize(midnight).astimezone(self.tz)

            # test for the logfile of that date in our target timezone
            if self._get_logfile_date(dtz.date(), load=False):
                days.add(date)

        return days