log_source.py
changeset 46 185504387370
parent 43 fc11c4e86a82
child 48 7858b7b8ffe3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/log_source.py	Sun Feb 08 03:23:25 2009 +0200
@@ -0,0 +1,231 @@
+"""
+    A source of IRC log files
+"""
+
+import codecs
+from datetime import date, datetime, timedelta
+import pytz
+
+# for SEEK_*, errno
+import os, errno
+
+class LogSource (object) :
+    """
+        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
+    """
+    
+    def get_latest (self, count) :
+        """
+            Yield the latest events, up to `count` of them.
+        """
+
+        raise NotImplementedError()
+
+class LogFile (LogSource) :
+    """
+        A file containing LogEvents
+    """
+
+    def __init__ (self, path, charset='utf-8', sep='\n') :
+        """
+            Open the file at the given path, which contains data in the given charset, as lines separated by the
+            given separator.
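+
+            A minimal usage sketch (hypothetical path):
+
+                >>> log = LogFile('/var/log/irc/#test.log')
+                >>> for line in log :
+                ...     print line.rstrip()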
+        """
+        
+        # store
+        self.path = path
+        self.charset = charset
+        self.sep = sep
+
+        # open via codecs, so reads decode to unicode
+        self.file = codecs.open(path, 'r', charset)
+    
+    def __iter__ (self) :
+        """
+            Yields a series of lines, as read from the top of the file
+        """
+        
+        # seek to beginning
+        self.file.seek(0)
+
+        # iterate over lines
+        return iter(self.file)
+    
+    def get_latest (self, count) :
+        """
+            Returns up to `count` lines from the end of the file, or fewer if the file doesn't contain that many
+            lines
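+
+            For example (a sketch, assuming /tmp/example.log contains the three lines "one", "two", "three"):
+
+                >>> log = LogFile('/tmp/example.log')
+                >>> log.get_latest(2)
+                [u'two', u'three']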
+        """
+
+        # the list of lines
+        lines = []
+
+        # seek to end of file
+        self.file.seek(0, os.SEEK_END)
+
+        # read offset: skip back past the trailing separator at EOF
+        # (this assumes the file ends with a separator)
+        size = offset = self.file.tell() - len(self.sep)
+
+        # use this blocksize
+        BLOCKSIZE = 1024
+
+        # trailing data
+        buf = ''
+
+        # read a block at a time, backwards
+        while count > 0 and offset > 0 :
+            # read a full block, or whatever remains in front of the offset
+            read_size = min(BLOCKSIZE, offset)
+
+            # update offset back one block
+            offset -= read_size
+
+            # seek to offset
+            self.file.seek(offset)
+
+            # read the block; the codecs reader takes a size in bytes, but may decode more characters than that
+            read_buf = self.file.read(read_size)
+
+            # trim off any extra decoded data, so block boundaries stay consistent
+            if len(read_buf) > read_size :
+                read_buf = read_buf[:read_size]
+
+            # make sure we got the right amount of data
+            assert len(read_buf) == read_size, "read(%d) @ %d/%d -> %d" % (read_size, offset, size, len(read_buf))
+
+            # prepend to our previous buf
+            buf = read_buf + buf
+
+            # split out lines
+            buf_lines = buf.split(self.sep)
+
+            # keep the first one as our buffer, as it may still be incomplete
+            buf = buf_lines[0]
+
+            # number of complete lines in this buffer
+            line_count = len(buf_lines) - 1
+
+            # prepend up to `count` of the trailing complete lines to our lines buffer
+            take = min(count, line_count)
+
+            if take :
+                lines = buf_lines[-take:] + lines
+
+            # update count
+            count -= line_count
+
+        # if we consumed the whole file and still need lines, the leftover buffer is the file's complete first line
+        if count > 0 and buf :
+            lines = [buf] + lines
+
+        # return the line list
+        return lines
+
+class LogDirectory (LogSource) :
+    """
+        A directory containing a series of timestamped LogFiles
+    """
+
+    def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
+        """
+            Load the logfiles at the given path.
+            
+            The files contain data in the given charset, and are named according to the date in the given timezone
+            and date format.
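+
+            A usage sketch (hypothetical path):
+
+                >>> import pytz
+                >>> logs = LogDirectory('/var/log/irc/#test', pytz.timezone('Europe/Helsinki'))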
+        """
+
+        # store
+        self.path = path
+        self.tz = tz
+        self.charset = charset
+        self.filename_fmt = filename_fmt
+
+    def _get_logfile_datetime (self, dt) :
+        """
+            Get the logfile corresponding to the given datetime
+        """
+
+        # convert to target timezone
+        dtz = dt.astimezone(self.tz)
+        
+        # convert to date and use that
+        return self._get_logfile_date(dtz.date())
+
+    def _get_logfile_date (self, d) :
+        """
+            Get the logfile corresponding to the given naive date in our timezone
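+
+            For example, with the default filename_fmt, date(2009, 2, 8) maps to <path>/2009-02-08 :
+
+                >>> date(2009, 2, 8).strftime('%Y-%m-%d')
+                '2009-02-08'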
+        """
+
+        # format filename
+        filename = d.strftime(self.filename_fmt)
+
+        # build path
+        path = os.path.join(self.path, filename)
+
+        # return the LogFile
+        return LogFile(path, self.charset)
+    
+    def _iter_backwards (self, dt=None) :
+        """
+            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at
+            the given *datetime*, or the current date, if none is given
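+
+            For example (continuing the LogDirectory sketch above, supposing "now" falls on 2009-02-08 in our
+            timezone):
+
+                >>> it = logs._iter_backwards()
+                >>> it.next(), it.next()
+                (datetime.date(2009, 2, 8), datetime.date(2009, 2, 7))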
+        """
+        
+        # default to now
+        if not dt :
+            dt = datetime.now(pytz.utc)
+        
+        # convert to target timezone
+        dtz = dt.astimezone(self.tz)
+
+        # our timedelta
+        ONE_DAY = timedelta(1)
+        
+        # iterate unto infinity
+        while True :
+            # yield
+            yield dtz.date()
+            
+            # one day sdrawkcab
+            dtz -= ONE_DAY
+    
+    def get_latest (self, count) :
+        """
+            Uses _iter_backwards and _get_logfile_date to yield the given number of lines from as many logfiles as
+            needed
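+
+            A usage sketch (continuing the example above):
+
+                >>> for line in logs.get_latest(10) :
+                ...     print line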
+        """
+        
+        # iterate backwards from now
+        day_iter = self._iter_backwards()
+
+        # number of files read
+        files = 0
+
+        # only read up to 100 files or so
+        MAX_FILES = 100
+        
+        # loop until done
+        while count > 0 :
+            logfile = None
+
+            try :
+                # count this attempt
+                files += 1
+                
+                # get and open the next day's logfile
+                logfile = self._get_logfile_date(day_iter.next())
+            
+            except IOError, e :
+                # propagate anything other than a missing file
+                if e.errno != errno.ENOENT :
+                    raise
+
+                if files > MAX_FILES :
+                    # give up after trying too many missing days
+                    raise Exception("No recent logfiles found")
+                
+                else :
+                    # skip the nonexistent day
+                    continue
+
+            # yield lines
+            for line in logfile.get_latest(count) :
+                # yield while we still need lines; otherwise, stop
+                if count > 0 :
+                    # decrement
+                    count -= 1
+ 
+                    yield line
+            
+                else :
+                    break
+
+