sites/irclogs.qmsk.net/log_source.py
author Tero Marttila <terom@fixme.fi>
Sun, 08 Feb 2009 02:55:53 +0200
branch sites
changeset 43 fc11c4e86a82
parent 41 9585441a4bfb
permissions -rw-r--r--
implement channel_view count, the query stuff, css, layout all need some cleanup :(
"""
    A source of IRC log files
"""

import codecs
from datetime import date, datetime, timedelta
import pytz

# for SEEK_*, errno
import os, errno

class LogSource (object) :
    """
        A collection of IRC logs for a specific target in some format. Provides the ability to read specific events.
    """
    
    def get_latest (self, count) :
        """
            Yield the latest events, up to `count` of them.
        """

        raise NotImplementedError()

class LogFile (LogSource) :
    """
        A file containing LogEvents
    """

    def __init__ (self, path, charset='utf-8', sep='\n') :
        """
            Open the file at the given path, which contains data in the given charset, as lines separated by the given separator
        """
        
        # store
        self.path = path
        self.charset = charset
        self.sep = sep

        # open
        self.file = codecs.open(path, 'r', charset)
    
    def __iter__ (self) :
        """
            Yields a series of lines, as read from the top of the file
        """
        
        # seek to beginning
        self.file.seek(0)

        # iterate over lines
        return iter(self.file)
    
    def get_latest (self, count) :
        """
            Returns up to <count> lines from the end of the file, or fewer, if the file doesn't contain that many lines
        """

        # the list of lines
        lines = []

        # seek to end of file
        self.file.seek(0, os.SEEK_END)

        # read offset
        # XXX: why -2? presumably to skip a trailing '\r\n' separator; a file ending in a bare '\n' loses its last char
        size = offset = self.file.tell() - 2

        # use this blocksize
        BLOCKSIZE = 1024

        # trailing data
        buf = ''

        # read a block at a time, backwards
        while count > 0 and offset > 0 :
            # how much to read: a full block, or whatever remains before the start of the file
            read_size = min(BLOCKSIZE, offset)

            # update offset back one block, never past the start of the file
            offset -= read_size

            # seek to offset
            self.file.seek(offset)

            # read the block
            read_buf = self.file.read(read_size)

            # the codecs reader may decode more chars than we asked for; trim off the extra
            if len(read_buf) > read_size :
                read_buf = read_buf[:read_size]

            # make sure we got the right amount of data
            assert len(read_buf) == read_size, "read(%d) @ %d/%d -> %d" % (read_size, offset, size, len(read_buf))

            # add in our previous buf
            buf = read_buf + buf
            
            # split out lines
            buf_lines = buf.split(self.sep)

            # keep the first one as our buffer, as it may still be incomplete
            buf = buf_lines[0]

            # how many of the complete lines we still need
            take = min(count, len(buf_lines) - 1)

            # add up to count lines to our lines buffer
            # note: buf_lines[-0:] would be the whole list, so guard against take == 0
            if take :
                lines = buf_lines[-take:] + lines

            # update count
            count -= (len(buf_lines) - 1)

        # if we read the whole file and still need lines, the leftover buf is the file's first line
        if count > 0 and buf :
            lines = [buf] + lines

        # return the line list
        return lines
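
# Example usage (a sketch; the path here is hypothetical, not part of the real site config):
#
#   logfile = LogFile('logs/2009-02-08')
#
#   # the last five lines of the file, in file order
#   for line in logfile.get_latest(5) :
#       print line.encode('utf-8')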

class LogDirectory (LogSource) :
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
        """
            Load the logfiles at the given path.
            
            The files contain data in the given charset, and are named according to the date in the given timezone and
            date format.
        """

        # store
        self.path = path
        self.tz = tz
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime (self, dt) :
        """
            Get the logfile corresponding to the given datetime
        """

        # convert to target timezone
        dtz = dt.astimezone(self.tz)
        
        # convert to date and use that
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date (self, d) :
        """
            Get the logfile corresponding to the given naive date in our timezone
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        # return the LogFile
        return LogFile(path, self.charset)
    
    def _iter_backwards (self, dt=None) :
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
            given *datetime*, or the current date, if none is given
        """
        
        # default to now
        if not dt :
            dt = datetime.now(pytz.utc)
        
        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # our timedelta
        ONE_DAY = timedelta(1)
        
        # iterate unto infinity
        while True :
            # yield
            yield dtz.date()
            
            # one day sdrawkcab
            dtz -= ONE_DAY
    
    def get_latest (self, count) :
        """
            Uses _iter_backwards + _get_logfile_date to read and yield the given number of lines from as many logfiles
            as needed, yielding lines from the most recent logfile first
        """
        
        # iterate backwards from now
        day_iter = self._iter_backwards()

        # number of logfiles tried
        files = 0

        # only read up to 100 files or so
        MAX_FILES = 100
        
        # loop until done
        while count > 0 :
            logfile = None

            try :
                # count this logfile against the limit
                files += 1
                
                # open the next logfile, going backwards in time
                logfile = self._get_logfile_date(day_iter.next())
            
            except IOError, e :
                # only skip days whose logfile doesn't exist
                if e.errno != errno.ENOENT :
                    raise

                # give up if we've gone back too many days without finding enough lines
                if files > MAX_FILES :
                    raise Exception("No recent logfiles found")
                
                else :
                    # skip to the next (earlier) day
                    continue

            # yield lines
            for line in logfile.get_latest(count) :
                # yield while we still need to, otherwise, stop
                if count > 0 :
                    # decrement
                    count -= 1

                    yield line
            
                else :
                    break
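
if __name__ == '__main__' :
    # minimal usage sketch: the 'logs' path and the timezone here are hypothetical
    # examples, not the site's real configuration
    logs = LogDirectory('logs', pytz.timezone('Europe/Helsinki'))

    # print the latest 20 lines, from the most recent logfile first
    for line in logs.get_latest(20) :
        print line.encode('utf-8')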