--- a/log_source.py Sun Feb 08 04:59:22 2009 +0200
+++ b/log_source.py Mon Feb 09 00:24:13 2009 +0200
@@ -2,13 +2,10 @@
A source of IRC log files
"""
-import codecs
-from datetime import date, datetime, timedelta
+import datetime, itertools
+import os, errno
import pytz
-# for SEEK_*, errno
-import os, errno
-
class LogSource (object) :
"""
A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
@@ -20,19 +17,30 @@
"""
abstract
+
+ def get_date (self, dt) :
+ """
+ Get logs for the given date (as a datetime)
+ """
+
+ abstract
class LogFile (LogSource) :
"""
A file containing LogEvents
"""
- def __init__ (self, path, charset='utf-8', sep='\n') :
+ def __init__ (self, path, parser, start_date=None, charset='utf-8', sep='\n') :
"""
- Open the file at the given path, which contains data of the given codec, as lines separated by the given separator
+ Open the file at the given path, which contains data with the given charset, as lines separated by the
+ given separator. Lines are parsed using the given parser, using the given date as an initial date, see
+ LogParser for more info. XXX: currently we assume start_date also for the end of the file
"""
# store
self.path = path
+ self.parser = parser
+ self.start_date = start_date
self.charset = charset
self.sep = sep
@@ -41,22 +49,68 @@
def __iter__ (self) :
"""
- Yields a series of lines, as read from the top of the file
+ Yields a series of unicode lines, as read from the top of the file
"""
# seek to beginning
self.file.seek(0)
- # iterate over lines
- return iter(self.file)
+ # iterate over lines, decoding them as well
+ return (line.decode(self.charset) for line in self.file)
- def get_latest (self, count) :
+ def read_full (self) :
"""
- Returns up to <count> lines from the end of the file, or less, if the file doesn't contain that many lines
+ Reads all LogLines
+ """
+
+ # just use our __iter__
+ return self.parser.parse_lines(self, self.start_date)
+
+ def read_from (self, dt) :
+ """
+ Reads all LogLines from the given naive timestamp onwards
+ """
+
+ # start reading at beginning
+ events = self.read_full()
+
+ # skip unwanted events
+ for event in events :
+ if event.timestamp < dt :
+ continue
+
+ else :
+ # include this line as well
+ yield event
+ break
+
+ # yield the rest as-is
+ for event in events :
+ yield event
+
+ def read_until (self, dt) :
+ """
+ Reads all LogLines up until the given naive timestamp
"""
- # the list of lines
- lines = []
+ # start reading events at the beginning
+ events = self.read_full()
+
+ # yield events until we hit the given timestamp
+ for event in events :
+ if event.timestamp <= dt :
+ yield event
+
+ else :
+ break
+
+ # ignore the rest
+ return
+
+ def _read_blocks_reverse (self, blocksize=1024) :
+ """
+ Yields blocks of file data in reverse order, starting at the end of the file
+ """
# seek to end of file
self.file.seek(0, os.SEEK_END)
@@ -64,20 +118,14 @@
# read offset
# XXX: hack -1 to get rid of trailing newline
size = offset = self.file.tell() - 1
-
- # use this blocksize
- BLOCKSIZE = 1024
-
- # trailing data
- buf = ''
-
- # read a block at a time, backwards
- while len(lines) < count and offset > 0:
+
+ # do not try to read past the beginning of the file
+ while offset > 0:
# calc new offset + size
- if offset > BLOCKSIZE :
+ if offset > blocksize :
# full block
- offset -= BLOCKSIZE
- read_size = BLOCKSIZE
+ offset -= blocksize
+ read_size = blocksize
else :
# partial block
@@ -88,47 +136,77 @@
self.file.seek(offset)
# read the data we want
- read_buf = self.file.read(read_size)
- read_len = len(read_buf)
+ block = self.file.read(read_size)
# sanity check
- assert read_len == read_size
+ assert len(block) == read_size
+ # yield
+ yield block
+
+ def _read_lines_reverse (self) :
+ """
+ Yields decoded lines from the end of the file, in reverse order.
+ """
+
+ # partial lines
+ buf = ''
+
+ # read from end of file, a block at a time
+ for block in self._read_blocks_reverse() :
# add in our previous buf
- buf = read_buf + buf
+ buf = block + buf
- # split out lines
- buf_lines = buf.split(self.sep)
+ # split up lines
+ lines = buf.split(self.sep)
# keep the first one as our buffer, as it's incomplete
- buf = buf_lines[0]
+ buf = lines[0]
+
+ # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
+ # XXX: use something like islice, this has to build a slice object
+ for line in lines[:0:-1] :
+ yield line.decode(self.charset)
- # prepend up to count lines from the end to our lines buffer
- lines = buf_lines[-min(count, len(buf_lines) - 1):] + lines
+ def get_latest (self, count) :
+ """
+        Returns up to count events, from the end of the file, or fewer, if the file doesn't contain that many lines.
+ """
+
+ # the list of lines
+ lines = []
+
+ # start reading lines into lines
+ for line in self._read_lines_reverse() :
+ # append
+ lines.append(line)
+
+ # done?
+ if len(lines) >= count :
+ break
- # decode
- # XXX: better queue implementation, plz
- lines = [line.decode(self.charset) for line in lines]
-
- # return the line list
- return lines
+ # decode in reverse order, using our starting date....
+ # XXX: use lines[::-1] or reversed?
+ # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
+ return self.parser.parse_lines(reversed(lines), self.start_date)
class LogDirectory (LogSource) :
"""
A directory containing a series of timestamped LogFiles
"""
- def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
+ def __init__ (self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d') :
"""
Load the logfiles at the given path.
The files contain data in the given charset, and are named according the the date in the given timezone and
- date format.
+ date format, and will be parsed using the given parser.
"""
# store
self.path = path
self.tz = tz
+ self.parser = parser
self.charset = charset
self.filename_fmt = filename_fmt
@@ -155,9 +233,9 @@
path = os.path.join(self.path, filename)
# return the LogFile
- return LogFile(path, self.charset)
+ return LogFile(path, self.parser, d, self.charset)
- def _iter_backwards (self, dt=None) :
+ def _iter_date_reverse (self, dt=None) :
"""
Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
given *datetime*, or the the current date, if none given
@@ -165,13 +243,13 @@
# default to now
if not dt :
- dt = datetime.now(pytz.utc)
+ dt = datetime.datetime.now(pytz.utc)
# convert to target timezone
dtz = dt.astimezone(self.tz)
# our timedelta
- ONE_DAY = timedelta(1)
+ ONE_DAY = datetime.timedelta(1)
# iterate unto infinity
while True :
@@ -187,7 +265,7 @@
"""
# iterate backwards from now
- day_iter = self._iter_backwards()
+ day_iter = self._iter_date_reverse()
# number of files read
files = 0
@@ -195,7 +273,7 @@
# only read up to 100 files or so
MAX_FILES = 100
- # read the lines into here
+ # read the events into here
lines = []
# loop until done
@@ -221,9 +299,37 @@
# skip to next day
continue
- # read the lines
- lines = logfile.get_latest(count) + lines
+ # read the events
+ # XXX: use a queue
+ lines = list(logfile.get_latest(count)) + lines
- # return the lines
+ # return the events
return lines
+ def get_date (self, dt) :
+ """
+        A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given datetime
+ differs from our native datetime, this may involve lines from more than one logfile.
+ """
+
+ # begin/end of 24h period, in target timezone
+ dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
+ dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
+
+ # as dates
+ d_begin = dtz_begin.date()
+ d_end = dtz_end.date()
+
+ # if they're the same, just pull the full log for that date
+ if d_begin == d_end :
+ return self._get_logfile_date(d_begin).read_full()
+
+ # otherwise, we need to pull two partial logs
+ else :
+ # open both of them
+ f_begin = self._get_logfile_date(d_begin)
+ f_end = self._get_logfile_date(d_end)
+
+ # chain together the two sources
+ return itertools.chain(f_begin.read_from(dtz_begin), f_end.read_until(dtz_end))
+