--- a/log_source.py Sun Sep 13 00:49:55 2009 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,679 +0,0 @@
-"""
- A source of IRC log files
-"""
-
-import datetime, calendar, itertools, functools, math
-import os, os.path, errno
-import pytz
-
-import config, utils
-
-# a timedelta that represents one day
-ONE_DAY = datetime.timedelta(days=1)
-
-class LogSourceDecoder (object) :
- """
- Handles decoding of LogSource lines
- """
-
- def __init__ (self, encoding_list) :
- """
- Will try each of the given (charset, errors) items in turn, until one succeeds
- """
-
- self.encoding_list = encoding_list
-
- def decode (self, line) :
- """
- Decode the line of str() text into an unicode object
- """
-
- # list of errors encountered
- error_list = []
-
- # try each in turn
- for charset, errors in self.encoding_list :
- # trap UnicodeDecodeError to try with the next one
- try :
- return line.decode(charset, errors)
-
- except UnicodeDecodeError, e :
- error_list.append("%s:%s - %s" % (charset, errors, e))
- continue
-
- # failure
- raise UnicodeDecodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
-
-class LogSource (object) :
- """
- A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
- """
-
- def __init__ (self, decoder, channel=None) :
- """
- The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
- known, then it can be given as None, and set later with bind_channel.
-
- Uses the given LogSourceDecoder to decode the lines.
- """
-
- self.channel = channel
- self.decoder = decoder
-
- def bind_channel (self, channel) :
- """
- Set this source's channel, where None was set before
- """
-
- assert not self.channel
-
- self.channel = channel
-
- def get_latest (self, count) :
- """
- Yield the latest events, up to `count` of them.
- """
-
- abstract
-
- def get_date (self, dt) :
- """
- Get logs for the given date (as a datetime).
- """
-
- abstract
-
- def get_date_paged (self, dt, count, page=None) :
- """
- Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
- portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
- then the lines for the page containing the given timestamp is returned.
-
- The return value is a (page, max, lines) tuple.
- """
-
- # how to act?
- if page :
- # constant skip
- skip = (page - 1) * count
-
- else :
- skip = None
-
- # go through the logs a page at a time
- this_page = 1
-
- # last line's timestamp
- last_ts = None
-
- # found it yet?
- found = False
-
- # count the full number of lines
- line_count = 0
-
- # collect lines
- lines = []
-
- # iterate using get_date
- for line in self.get_date(dt) :
- # count them
- line_count += 1
-
- # skip?
- if skip :
- skip -= 1
- continue
-
- # is this page all that we want/need?
- if page or found :
- # already full?
- if len(lines) >= count :
- continue
-
- # specfic timestamp
- else :
- # didn't find it in this page?
- if len(lines) >= count :
- # reset to next page
- lines = []
- this_page += 1
-
- # is dt between these two timestamps?
- if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
- # found!
- found = True
- page = this_page
-
- else :
- # keep looking
- last_ts = line.timestamp
-
- # store line
- lines.append(line)
-
- # calculate max_pages
- max_pages = math.ceil(float(line_count) / count)
-
- # return
- return (page, max_pages, lines)
-
- def get_month_days (self, dt) :
- """
- Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
- """
-
- abstract
-
- def get_modified (self, dt=None, after=None, until=None) :
- """
- Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.
-
- If the datetime is not given, *all* lines are returned.
-
- If after is given, only lines from said date onwards will be returned, regardless of modification.
- If until is given, only lines up to and including said date will be returned, regardless of modification.
-
- The LogLines should be in time order.
- """
-
- abstract
-
- def get_prev_date (self, dt) :
- """
- Get the next distinct date of logs available preceeding the given date, or None
- """
-
- abstract
-
- def get_next_date (self, dt) :
- """
- Get the next distinct date of logs following the given date, or None.
- """
-
- abstract
-
-class LogFile (object) :
- """
- A file containing LogEvents
-
- XXX: modify to implement LogSource?
- """
-
- def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
- """
- Open the file at the given path, which contains lines as separated by the given separator. Lines are
- decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
- as the initial date for this log's first line.
-
- XXX: currently we assume start_date also for the end of the file
- """
-
- # store
- self.channel = channel
- self.path = path
- self.parser = parser
- self.start_date = start_date
- self.decoder = decoder
- self.sep = sep
-
- # open
- self.file = open(path, 'rb')
-
- def __iter__ (self) :
- """
- Yields a series of unicode lines, as read from the top of the file
- """
-
- # seek to beginning
- self.file.seek(0)
-
- # iterate over lines, decoding them as well
- return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
-
- def read_full (self) :
- """
- Reads all LogLines. The LogLines will have a valid offset.
- """
-
- # just use our __iter__
- return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)
-
- def read_from (self, dt) :
- """
- Reads all LogLines from the given naive timestamp onwards
- """
-
- # start reading at beginning
- events = self.read_full()
-
- # skip unwanted events
- for event in events :
- if event.timestamp < dt :
- continue
-
- else :
- # include this line as well
- yield event
- break
-
- # yield the rest as-is
- for event in events :
- yield event
-
- def read_until (self, dt) :
- """
- Reads all LogLines up until the given naive timestamp
- """
-
- # start reading events at the beginning
- events = self.read_full()
-
- # yield events until we hit the given timestamp
- for event in events :
- if event.timestamp <= dt :
- yield event
-
- else :
- break
-
- # ignore the rest
- return
-
- def _read_blocks_reverse (self, blocksize=1024) :
- """
- Yields blocks of file data in reverse order, starting at the end of the file
- """
-
- # seek to end of file
- self.file.seek(0, os.SEEK_END)
-
- # read offset
- # XXX: hack -1 to get rid of trailing newline
- size = offset = self.file.tell() - 1
-
- # do not try to read past the beginning of the file
- while offset > 0:
- # calc new offset + size
- if offset > blocksize :
- # full block
- offset -= blocksize
- read_size = blocksize
-
- else :
- # partial block
- read_size = offset
- offset = 0
-
- # seek to offset
- self.file.seek(offset)
-
- # read the data we want
- block = self.file.read(read_size)
-
- # sanity check
- assert len(block) == read_size
-
- # yield
- yield block
-
- def _read_lines_reverse (self) :
- """
- Yields decoded lines from the end of the file, in reverse order.
- """
-
- # partial lines
- buf = ''
-
- # read from end of file, a block at a time
- for block in self._read_blocks_reverse() :
- # add in our previous buf
- buf = block + buf
-
- # split up lines
- lines = buf.split(self.sep)
-
- # keep the first one as our buffer, as it's incomplete
- buf = lines[0]
-
- # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
- # XXX: use something like islice, this has to build a slice object
- for line in lines[:0:-1] :
- yield self.decoder.decode(line)
-
- def read_latest (self, count) :
- """
- Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
- """
-
- # the list of lines
- lines = []
-
- # start reading lines into lines
- for line in self._read_lines_reverse() :
- # append
- lines.append(line)
-
- # done?
- if len(lines) >= count :
- break
-
- # decode in reverse order, using our starting date....
- # XXX: use lines[::-1] or reversed?
- # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
- return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
-
-class LogDirectory (LogSource) :
- """
- A directory containing a series of timestamped LogFiles
- """
-
- def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
- """
- Load the logfiles at the given path, which are for the given LogChannel
-
- Decode the file lines using the given decoder, the files are named according the the date in the given
- timezone and date format, and will be parsed using the given parser.
- """
-
- # store
- self.channel = channel
- self.path = path
- self.tz = tz
- self.parser = parser
- self.decoder = decoder
- self.filename_fmt = filename_fmt
-
- def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
- """
- Get the logfile corresponding to the given naive date in our timezone.
-
- If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
- then this returns the file's mtime
-
- Returns None if the logfile does not exist, unless ignore_missing is given as False.
- """
-
- # format filename
- filename = d.strftime(self.filename_fmt)
-
- # build path
- path = os.path.join(self.path, filename)
-
- try :
- if load :
- # open+return the LogFile
- return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)
-
- elif mtime :
- # stat
- return utils.mtime(path)
-
- else :
- # test
- return os.path.exists(path)
-
- # XXX: move to LogFile
- except IOError, e :
- # return None for missing files
- if e.errno == errno.ENOENT and ignore_missing :
- return None
-
- else :
- raise
-
- def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
- """
- Yields a series of naive datetime objects representing the logfiles that are available, in time order.
-
- Parameters :
- after only dates from said date onwards will be returned
- until only dates up to and including said date will be returned
- reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
- """
-
- # convert timestamps to our timezone's dates
- if after :
- after = after.astimezone(self.tz).date()
-
- if until :
- until = until.astimezone(self.tz).date()
-
- # listdir
- filenames = os.listdir(self.path)
-
- # sort
- filenames.sort(reverse=reverse)
-
- # iter files
- for filename in filenames :
- try :
- # parse date
- dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
- date = dt.date()
-
- except :
- # ignore
- continue
-
- else :
- if (after and date < after) or (until and date > until) :
- # ignore
- continue
-
- else :
- # yield
- yield dt
-
- def _iter_date_reverse (self, dt=None) :
- """
- Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
- given *datetime*, or the the current date, if none given
- """
-
- # default to now
- if not dt :
- dtz = self.tz.localize(datetime.datetime.now())
-
- else :
- # convert to target timezone
- dtz = dt.astimezone(self.tz)
-
- # iterate unto infinity
- while True :
- # yield
- yield dtz.date()
-
- # one day sdrawkcab
- dtz -= ONE_DAY
-
- def _iter_logfile_reverse (self, dt=None, max_files=100) :
- """
- Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
- current date, if none given.
-
- Reads/probes at most max_files files.
- """
-
- # start counting at zero...
- file_count = 0
-
- # have we found any files at all so far?
- have_found = False
-
- # iterate backwards over days
- for day in self._iter_date_reverse(dt) :
- # stop if we've handled enough files by now
- if file_count > max_files :
- break
-
- # try and open the next logfile
- logfile = None
-
- file_count += 1
- logfile = self._get_logfile_date(day, ignore_missing=True)
-
- # no logfile there?
- if not logfile :
- # hit our limit?
- if file_count > max_files :
- # if we didn't find any logfiles at all, terminate rudely
- if not have_found :
- raise Exception("No recent logfiles found")
-
- else :
- # stop looking, deal with what we've got
- return
-
- else :
- # skip to next day
- continue
-
- # mark have_found
- have_found = True
-
- # yield it
- yield logfile
-
- def get_latest (self, count) :
- """
- Uses _logfile_reverse to read the yield the given lines from as many logfiles as needed
- """
-
- # read the events into here
- lines = []
-
- # start reading in those logfiles
- for logfile in self._iter_logfile_reverse() :
- # read the events
- # XXX: use a queue
- lines = list(logfile.read_latest(count)) + lines
-
- # done?
- if len(lines) >= count :
- break
-
- # return the events
- return lines
-
- def get_date (self, dt) :
- """
- A 'day' is considered to be a 24-hour period from 00:00:00 23:59:59. If the timezone of the given datetime
- differs from our native datetime, this may involve lines from more than one logfile.
- """
-
- # begin/end of 24h period, in target timezone
- dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
- dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
-
- # as dates
- d_begin = dtz_begin.date()
- d_end = dtz_end.date()
-
-# print
-# print "LogDirectory.get_date - %s" % dt
-# print "\t %s %s" % (d_begin, dtz_begin)
-# print "\t-> %s %s" % (d_end, dtz_end)
-
- # if they're the same, just pull the full log for that date
- if d_begin == d_end :
- # open that log
- logfile = self._get_logfile_date(d_begin)
-
- # return the full data
- return logfile.read_full()
-
- # otherwise, we need to pull two partial logs
- else :
- # open both of them, but it's okay if we don't have the second one
- f_begin = self._get_logfile_date(d_begin)
- f_end = self._get_logfile_date(d_end, ignore_missing=True)
-
- # chain together the two sources
- return itertools.chain(
- f_begin.read_from(dtz_begin),
- f_end.read_until(dtz_end) if f_end else []
- )
-
- def _iter_month_days (self, month) :
- """
- Iterates over the days of a month as dt objects with time=0
- """
-
- # there's at most 31 days in a month...
- for day in xrange(1, 32) :
- try :
- # try and build the datetime
- dt = datetime.datetime(month.year, month.month, day)
-
- except :
- # stop
- return
-
- else :
- # fix timezones + yield
- yield month.tzinfo.localize(dt)
-
- def get_month_days (self, month) :
- """
- Returns a set of dates for which logfiles are available in the given datetime's month
- """
-
- # iterate over month's days
- for dt in self._iter_month_days(month) :
- # date in our target timezone
- log_date = dt.astimezone(self.tz).date()
-
- # test for it
- if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
- # valid
- yield dt.date()
-
- def get_modified (self, dt=None, after=None, until=None) :
- """
- Returns the contents off all logfiles with mtimes past the given date
- """
-
- # iterate through all available logfiles in date order, as datetimes, from the given date on
- for log_date in self._iter_logfile_dates(after, until) :
- # compare against dt?
- if dt :
- # stat
- mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)
-
- # not modified?
- if mtime < dt :
- # skip
- continue
-
- # open
- logfile = self._get_logfile_date(log_date)
-
- # yield all lines
- for line in logfile.read_full() :
- yield line
-
- def get_prev_date (self, dt) :
- """
- Just use _iter_logfile_dates
- """
-
- # use for to "iter" once
- for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) :
- return log_date
-
- else :
- return None
-
- def get_next_date (self, dt) :
- """
- Just use _iter_logfile_dates
- """
-
- # use for to "iter" once
- for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) :
- return log_date
-
- else :
- return None
-