"""
A source of IRC log files
"""
import datetime, calendar, itertools, functools, math
import os, os.path, errno
import pytz
import config, utils
# a timedelta that represents exactly one day; used to step between consecutive daily logs
ONE_DAY = datetime.timedelta(days=1)
class LogSourceDecoder (object) :
    """
        Handles decoding of LogSource lines
    """

    def __init__ (self, encoding_list) :
        """
            Will try each of the given (charset, errors) items in turn, until one succeeds.

                encoding_list   - sequence of (charset, errors) pairs, as accepted by the .decode() method
        """

        self.encoding_list = encoding_list

    def decode (self, line) :
        """
            Decode the line of raw (undecoded) text into an unicode object.

            Each (charset, errors) pair is tried in the order given, and the result of the first one that decodes
            without error is returned.

            Raises UnicodeDecodeError if every pair fails (or if no pairs were given); the error's reason lists the
            individual failures.
        """

        # list of errors encountered
        error_list = []

        # try each in turn
        for charset, errors in self.encoding_list :
            # trap UnicodeDecodeError to try with the next one
            try :
                return line.decode(charset, errors)

            except UnicodeDecodeError as e :
                error_list.append("%s:%s - %s" % (charset, errors, e))

        # failure
        # UnicodeDecodeError requires exactly (encoding, object, start, end, reason); building it with a single
        # message argument raises a TypeError instead of the intended error
        charsets = ', '.join(str(charset) for charset, errors in self.encoding_list) or 'none'

        raise UnicodeDecodeError(charsets, line, 0, len(line),
            "Failed to decode line: %r: %s" % (line, ', '.join(error_list))
        )
class LogSource (object) :
    """
        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events.

        This is an abstract base: subclasses must implement get_latest, get_date, get_month_days, get_modified,
        get_prev_date and get_next_date. get_date_paged is implemented here on top of get_date.
    """

    def __init__ (self, decoder, channel=None) :
        """
            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
            known, then it can be given as None, and set later with bind_channel.

            Uses the given LogSourceDecoder to decode the lines.
        """

        self.channel = channel
        self.decoder = decoder

    def bind_channel (self, channel) :
        """
            Set this source's channel, where None was set before
        """

        assert not self.channel

        self.channel = channel

    def get_latest (self, count) :
        """
            Yield the latest events, up to `count` of them.
        """

        raise NotImplementedError("%s.get_latest" % type(self).__name__)

    def get_date (self, dt) :
        """
            Get logs for the given date (as a datetime).
        """

        raise NotImplementedError("%s.get_date" % type(self).__name__)

    def get_date_paged (self, dt, count, page=None) :
        """
            Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
            portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
            then the lines for the page containing the given timestamp is returned.

                dt      - the date to get logs for. If page is None, this is also the specific timestamp to page to
                count   - number of lines per page to return
                page    - specific page to return, or None to pick the right page for the given datetime

            The return value is a (page, max, lines) tuple.

                page    - the selected page
                max     - total number of pages
                lines   - the sequence of lines for the selected page

            If page is None and the timestamp lies after every line of the day, the last page is selected.
        """

        # how to act?
        if page :
            # constant skip
            skip = (page - 1) * count

        else :
            skip = None

        # the page currently being collected when searching for a timestamp
        this_page = 1

        # last line's timestamp
        last_ts = None

        # found the page containing dt yet?
        found = False

        # count the full number of lines, needed for the page total
        line_count = 0

        # collect lines
        lines = []

        # iterate using get_date; we always walk the whole day so that line_count is complete
        for line in self.get_date(dt) :
            line_count += 1

            # still skipping up to the requested page?
            if skip :
                skip -= 1
                continue

            # is this page all that we want/need?
            if page or found :
                # already full?
                if len(lines) >= count :
                    continue

            # specific timestamp
            else :
                # didn't find it in this page?
                if len(lines) >= count :
                    # reset to next page
                    lines = []
                    this_page += 1

                # is dt between these two timestamps?
                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
                    # found!
                    found = True
                    page = this_page

                else :
                    # keep looking
                    last_ts = line.timestamp

            # store line
            lines.append(line)

        # the timestamp was later than every line: `lines` holds the last page, so report that page rather than
        # returning None as the page number
        if not page :
            page = this_page

        # calculate max_pages
        max_pages = int(math.ceil(float(line_count) / count))

        return (page, max_pages, lines)

    def get_month_days (self, dt) :
        """
            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
        """

        raise NotImplementedError("%s.get_month_days" % type(self).__name__)

    def get_modified (self, dt=None, after=None, until=None) :
        """
            Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.

            If the datetime is not given, *all* lines are returned.

            If after is given, only lines from said date onwards will be returned, regardless of modification.
            If until is given, only lines up to and including said date will be returned, regardless of modification.

            The LogLines should be in time order.
        """

        raise NotImplementedError("%s.get_modified" % type(self).__name__)

    def get_prev_date (self, dt) :
        """
            Get the next distinct date of logs available preceding the given date, or None
        """

        raise NotImplementedError("%s.get_prev_date" % type(self).__name__)

    def get_next_date (self, dt) :
        """
            Get the next distinct date of logs following the given date, or None.
        """

        raise NotImplementedError("%s.get_next_date" % type(self).__name__)
class LogFile (object) :
    """
        A file containing LogEvents

        The underlying file is opened on construction; use close() (or the instance as a context manager) to release
        it.

        XXX: modify to implement LogSource?
    """

    def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep=b'\n') :
        """
            Open the file at the given path, which contains lines as separated by the given separator. Lines are
            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
            as the initial date for this log's first line.

            The file is read in binary mode, so the separator must be a byte string.

            XXX: currently we assume start_date also for the end of the file
        """

        # store
        self.channel = channel
        self.path = path
        self.parser = parser
        self.start_date = start_date
        self.decoder = decoder
        self.sep = sep

        # open
        self.file = open(path, 'rb')

    def close (self) :
        """
            Close the underlying file. The LogFile cannot be read from after this.
        """

        self.file.close()

    def __enter__ (self) :
        """
            Context manager support: returns the LogFile itself
        """

        return self

    def __exit__ (self, exc_type, exc_value, traceback) :
        """
            Context manager support: closes the file, does not suppress exceptions
        """

        self.close()

    def __iter__ (self) :
        """
            Yields a series of unicode lines, as read from the top of the file
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines, decoding them as well
        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)

    def read_full (self) :
        """
            Reads all LogLines. The LogLines will have a valid offset.
        """

        # just use our __iter__
        return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)

    def read_from (self, dt) :
        """
            Reads all LogLines from the given naive timestamp onwards
        """

        # use a single shared iterator, so that the second loop continues where the first one stopped even if the
        # parser returns a re-iterable sequence
        events = iter(self.read_full())

        # skip unwanted events
        for event in events :
            if event.timestamp >= dt :
                # include this line as well
                yield event
                break

        # yield the rest as-is
        for event in events :
            yield event

    def read_until (self, dt) :
        """
            Reads all LogLines up until the given naive timestamp
        """

        # yield events until we hit the given timestamp, and ignore the rest
        for event in self.read_full() :
            if event.timestamp > dt :
                break

            yield event

    def _read_blocks_reverse (self, blocksize=1024) :
        """
            Yields blocks of file data in reverse order, starting at the end of the file.

            A single separator at the very end of the file is not included in the data, so that it does not show up
            as an empty last line.
        """

        # seek to end of file
        self.file.seek(0, os.SEEK_END)

        # read offset
        offset = self.file.tell()

        # only drop the trailing separator if it is actually there; a file that does not end with one must keep
        # its last bytes
        sep_len = len(self.sep)

        if sep_len and offset >= sep_len :
            self.file.seek(offset - sep_len)

            if self.file.read(sep_len) == self.sep :
                offset -= sep_len

        # do not try to read past the beginning of the file
        while offset > 0 :
            # a full block, or whatever is left at the start of the file
            read_size = min(offset, blocksize)
            offset -= read_size

            # seek to offset
            self.file.seek(offset)

            # read the data we want
            block = self.file.read(read_size)

            # sanity check
            assert len(block) == read_size

            yield block

    def _read_lines_reverse (self) :
        """
            Yields decoded lines from the end of the file, in reverse order.
        """

        # partial line carried over from the previous (later in the file) block
        buf = b''

        # did the file contain any data at all?
        have_data = False

        # read from end of file, a block at a time
        for block in self._read_blocks_reverse() :
            have_data = True

            # add in our previous buf
            buf = block + buf

            # split up lines
            lines = buf.split(self.sep)

            # keep the first one as our buffer, as it's incomplete
            buf = lines[0]

            # yield the rest a line at a time in reverse order
            for line in lines[:0:-1] :
                yield self.decoder.decode(line)

        # whatever remains once the start of the file is reached is the file's first line
        if have_data :
            yield self.decoder.decode(buf)

    def read_latest (self, count) :
        """
            Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
        """

        # the last `count` lines, latest first; a count of zero or less reads nothing
        lines = list(itertools.islice(self._read_lines_reverse(), max(count, 0)))

        # parse in file order, using our starting date....
        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
        return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
class LogDirectory (LogSource) :
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
        """
            Load the logfiles at the given path, which are for the given LogChannel.

            Decode the file lines using the given decoder, the files are named according to the date in the given
            timezone and date format, and will be parsed using the given parser.

                path            - the directory containing the logfiles
                tz              - timezone of the logfile names; must provide .localize() (NOTE: presumably a pytz timezone)
                parser          - handed to each LogFile to parse its lines
                decoder         - LogSourceDecoder handed to each LogFile
                filename_fmt    - strftime/strptime format mapping a date to a filename
                channel         - the LogChannel, or None to set it later with bind_channel
        """

        # store
        self.channel = channel
        self.path = path
        self.tz = tz
        self.parser = parser
        self.decoder = decoder
        self.filename_fmt = filename_fmt

    def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
        """
            Get the logfile corresponding to the given naive date in our timezone.

            If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given
            (and load is False), then this returns the result of utils.mtime for the file instead.

            If the logfile does not exist, None is returned when ignore_missing is True; otherwise the error from
            opening/stat'ing the file is raised.
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        try :
            if load :
                # open+return the LogFile
                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)

            elif mtime :
                # stat
                return utils.mtime(path)

            else :
                # test
                return os.path.exists(path)

        # XXX: move to LogFile
        # EnvironmentError covers both the IOError from open() and an OSError from a stat
        except EnvironmentError as e :
            # return None for missing files
            if e.errno == errno.ENOENT and ignore_missing :
                return None

            else :
                raise

    def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
        """
            Yields a series of datetime objects (midnight, localized to our timezone) representing the logfiles that
            are available, in time order.

            Parameters :
                after   only dates from said date onwards will be returned
                until   only dates up to and including said date will be returned
                reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change

            Files in the directory whose name does not match the filename format are ignored.
        """

        # convert timestamps to our timezone's dates
        if after :
            after = after.astimezone(self.tz).date()

        if until :
            until = until.astimezone(self.tz).date()

        log_dates = []

        for filename in os.listdir(self.path) :
            try :
                # parse date
                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))

            except ValueError :
                # not a logfile name
                continue

            date = dt.date()

            if (after and date < after) or (until and date > until) :
                # out of range
                continue

            log_dates.append(dt)

        # order by the parsed date rather than the filename, so that the result is in time order regardless of
        # how the filename format sorts as text
        log_dates.sort(reverse=reverse)

        for dt in log_dates :
            yield dt

    def _iter_date_reverse (self, dt=None) :
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
            given *datetime*, or the current date, if none given
        """

        if not dt :
            # default to now, as seen in our timezone (not the system's local time relabeled as ours)
            dtz = datetime.datetime.now(self.tz)

        else :
            # convert to target timezone
            dtz = dt.astimezone(self.tz)

        day = dtz.date()

        # iterate unto infinity
        while True :
            yield day

            # one day sdrawkcab
            day -= ONE_DAY

    def _iter_logfile_reverse (self, dt=None, max_files=100) :
        """
            Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
            current date, if none given.

            Reads/probes at most max_files files. Raises an Exception if none of the probed days has a logfile.
        """

        # have we found any files at all so far?
        have_found = False

        # iterate backwards over at most max_files days
        for day in itertools.islice(self._iter_date_reverse(dt), max_files) :
            # try and open the next logfile
            logfile = self._get_logfile_date(day, ignore_missing=True)

            # no logfile there?
            if not logfile :
                # skip to next day
                continue

            have_found = True

            yield logfile

        # if we didn't find any logfiles at all, terminate rudely
        if not have_found :
            raise Exception("No recent logfiles found")

    def get_latest (self, count) :
        """
            Uses _iter_logfile_reverse to read the latest lines from as many logfiles as needed.

            Returns a list of up to count lines, in time order.
        """

        # read the events into here
        lines = []

        # start reading in those logfiles, latest first
        for logfile in self._iter_logfile_reverse() :
            # only ask for what is still missing, so that no more than count lines are returned in total
            # XXX: use a queue
            lines = list(logfile.read_latest(count - len(lines))) + lines

            # done?
            if len(lines) >= count :
                break

        return lines

    def get_date (self, dt) :
        """
            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given datetime
            differs from our native datetime, this may involve lines from more than one logfile.
        """

        # begin/end of 24h period, in target timezone
        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)

        # as dates
        d_begin = dtz_begin.date()
        d_end = dtz_end.date()

        # if they're the same, just pull the full log for that date
        if d_begin == d_end :
            # open that log
            logfile = self._get_logfile_date(d_begin)

            # return the full data
            return logfile.read_full()

        # otherwise, we need to pull two partial logs
        else :
            # open both of them, but it's okay if we don't have the second one
            f_begin = self._get_logfile_date(d_begin)
            f_end = self._get_logfile_date(d_end, ignore_missing=True)

            # chain together the two sources
            return itertools.chain(
                f_begin.read_from(dtz_begin),
                f_end.read_until(dtz_end) if f_end else []
            )

    def _iter_month_days (self, month) :
        """
            Iterates over the days of a month as dt objects with time=0, localized using the given datetime's tzinfo.

            NOTE: the tzinfo of `month` must provide .localize() (presumably a pytz timezone).
        """

        # number of days in this specific month
        days_in_month = calendar.monthrange(month.year, month.month)[1]

        for day in range(1, days_in_month + 1) :
            # build the datetime, fix timezones + yield
            yield month.tzinfo.localize(datetime.datetime(month.year, month.month, day))

    def get_month_days (self, month) :
        """
            Yields the dates for which logfiles are available in the given datetime's month, in order
        """

        # iterate over month's days
        for dt in self._iter_month_days(month) :
            # date in our target timezone
            log_date = dt.astimezone(self.tz).date()

            # test for it
            if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
                # valid
                yield dt.date()

    def get_modified (self, dt=None, after=None, until=None) :
        """
            Yields the contents of all logfiles with mtimes past the given date, or of all logfiles if no date is
            given. after/until limit the logfile dates considered, as for _iter_logfile_dates.
        """

        # iterate through all available logfiles in date order, as datetimes, from the given date on
        # NOTE(review): log_date is a localized datetime here, whereas other callers hand _get_logfile_date a
        # plain date - confirm that the parser accepts both as start_date
        for log_date in self._iter_logfile_dates(after, until) :
            # compare against dt?
            if dt :
                # stat
                mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)

                # file vanished, or not modified?
                if mtime is None or mtime < dt :
                    # skip
                    continue

            # open
            logfile = self._get_logfile_date(log_date)

            # yield all lines
            for line in logfile.read_full() :
                yield line

    def get_prev_date (self, dt) :
        """
            The latest logfile date before the given datetime's date, or None. Just uses _iter_logfile_dates.
        """

        # the first item when iterating backwards from the previous day
        return next(self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True), None)

    def get_next_date (self, dt) :
        """
            The earliest logfile date after the given datetime's date, or None. Just uses _iter_logfile_dates.
        """

        # the first item when iterating forwards from the next day
        return next(self._iter_logfile_dates(after=dt + ONE_DAY), None)