log_source.py
author Tero Marttila <terom@fixme.fi>
Mon, 09 Feb 2009 01:11:05 +0200
changeset 51 07ca28f3a9f2
parent 50 f13cf27a360b
child 54 b65a95eb9f6b
permissions -rw-r--r--
use improved URLConfig/URLType
"""
    A source of IRC log files
"""

import datetime, itertools
import os, errno
import pytz

class LogSource (object) :
    """
        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events.

        This is the abstract interface: both methods raise NotImplementedError here and are meant to be overridden
        by concrete sources.
    """
    
    def get_latest (self, count) :
        """
            Yield the latest events, up to `count` of them.

            Raises NotImplementedError unless overridden.
        """

        # explicit error instead of relying on a NameError from an undefined placeholder name
        raise NotImplementedError("%s.get_latest" % self.__class__.__name__)
    
    def get_date (self, dt) :
        """
            Get logs for the given date (as a datetime)

            Raises NotImplementedError unless overridden.
        """

        raise NotImplementedError("%s.get_date" % self.__class__.__name__)

class LogFile (LogSource) :
    """
        A file containing LogEvents.

        The file is opened by the constructor and stays open until close() is called. A LogFile can also be used as
        a context manager, which closes the file on exit.
    """

    def __init__ (self, path, parser, start_date=None, charset='utf-8', sep='\n') :
        """
            Open the file at the given path, which contains data with the given charset, as lines separated by the
            given separator. Lines are parsed using the given parser, using the given date as an initial date, see
            LogParser for more info. XXX: currently we assume start_date also for the end of the file

            Raises whatever open() raises (e.g. IOError with ENOENT) if the file cannot be opened.
        """
        
        # store
        self.path = path
        self.parser = parser
        self.start_date = start_date
        self.charset = charset
        self.sep = sep

        # open in binary mode; decoding with `charset` is done by us
        self.file = open(path, 'rb')

    def close (self) :
        """
            Close the underlying file. No iteration is possible afterwards.
        """

        self.file.close()

    def __enter__ (self) :
        """
            Context manager entry, returns the LogFile itself
        """

        return self

    def __exit__ (self, exc_type, exc_value, traceback) :
        """
            Context manager exit, closes the file. Exceptions are not suppressed.
        """

        self.close()
    
    def __iter__ (self) :
        """
            Yields a series of unicode lines, as read from the top of the file
        """
        
        # seek to beginning
        self.file.seek(0)

        # iterate over lines, decoding them as well
        return (line.decode(self.charset) for line in self.file)
    
    def read_full (self) :
        """
            Reads all LogLines, by handing ourselves (as an iterable of lines) and our start_date to the parser
        """
        
        # just use our __iter__
        return self.parser.parse_lines(self, self.start_date)

    def read_from (self, dt) :
        """
            Reads all LogLines from the given timestamp onwards.

            Events are skipped for as long as their `timestamp` compares less than `dt`; the first event that doesn't,
            and everything after it, is yielded as-is. `dt` must be comparable with the events' timestamps.
        """
        
        # dropwhile works on any iterable, so a parser that returns a list is not replayed from the start
        for event in itertools.dropwhile(lambda event : event.timestamp < dt, self.read_full()) :
            yield event

    def read_until (self, dt) :
        """
            Reads all LogLines up until the given timestamp (inclusive).

            Stops at the first event whose `timestamp` compares greater than `dt`; the rest of the file is ignored.
            `dt` must be comparable with the events' timestamps.
        """

        for event in itertools.takewhile(lambda event : event.timestamp <= dt, self.read_full()) :
            yield event

    def _sep_bytes (self) :
        """
            Returns our line separator as a byte string, encoding it with our charset if it was given as text.

            The file is read in binary mode, so the separator must be of the same type as the data read.
        """

        sep = self.sep

        if not isinstance(sep, bytes) :
            sep = sep.encode(self.charset)

        return sep

    def _read_blocks_reverse (self, blocksize=1024) :
        """
            Yields blocks of raw file data in reverse order, starting at the end of the file. Each block is at most
            `blocksize` bytes long.

            A single line separator at the very end of the file is left out, so that it does not show up as an empty
            last line. Nothing else is dropped, so a file that lacks the trailing separator keeps its last byte.
        """

        sep = self._sep_bytes()

        # seek to end of file
        self.file.seek(0, os.SEEK_END)
        offset = self.file.tell()

        # only strip the tail if it really is a separator
        if sep and offset >= len(sep) :
            self.file.seek(offset - len(sep))

            if self.file.read(len(sep)) == sep :
                offset -= len(sep)
        
        # do not try to read past the beginning of the file
        while offset > 0 :
            # full block, or a partial one at the start of the file
            read_size = min(offset, blocksize)
            offset -= read_size

            # read the data we want
            self.file.seek(offset)
            block = self.file.read(read_size)

            # sanity check
            assert len(block) == read_size

            yield block
    
    def _read_lines_reverse (self) :
        """
            Yields decoded lines from the end of the file, in reverse order, without the line separators.

            An empty file yields nothing.
        """

        sep = self._sep_bytes()

        # an empty file has no lines at all
        self.file.seek(0, os.SEEK_END)

        if not self.file.tell() :
            return

        # partial line carried over between blocks; kept as bytes so that multi-byte characters spanning a block
        # boundary are only decoded once the whole line is there
        buf = b''
        
        # read from end of file, a block at a time
        for block in self._read_blocks_reverse() :
            # add in our previous buf
            buf = block + buf
            
            # split up lines
            lines = buf.split(sep)

            # keep the first one as our buffer, as it may continue in the preceding block
            buf = lines[0]
           
            # the rest are complete lines, latest first
            for line in reversed(lines[1:]) :
                yield line.decode(self.charset)

        # what remains once we've hit the beginning of the file is the first line
        yield buf.decode(self.charset)

    def get_latest (self, count) :
        """
            Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.

            The lines are handed to the parser in file order, together with our start_date. A count of zero (or less)
            results in no lines being read.
        """

        # take at most `count` lines off the end of the file (latest first)
        lines = list(itertools.islice(self._read_lines_reverse(), max(count, 0)))
        
        # parse in file order, using our starting date....
        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
        return self.parser.parse_lines(reversed(lines), self.start_date)

class LogDirectory (LogSource) :
    """
        A directory containing a series of timestamped LogFiles
    """

    def __init__ (self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d') :
        """
            Load the logfiles at the given path.
            
            The files contain data in the given charset, and are named according to the date in the given timezone and
            date format, and will be parsed using the given parser.
        """

        # store
        self.path = path
        self.tz = tz
        self.parser = parser
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime (self, dt) :
        """
            Get the logfile corresponding to the given datetime. The datetime is first converted into our timezone
            using astimezone(), and its date part is then used to pick the file.
        """

        # convert to target timezone
        dtz = dt.astimezone(self.tz)
        
        # convert to date and use that
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date (self, d) :
        """
            Get the logfile corresponding to the given naive date in our timezone.

            The filename is the date formatted with filename_fmt, inside our directory. Opening happens in the LogFile
            constructor, so a missing file raises from here.
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        # return the LogFile, using the date as the parser's start date
        return LogFile(path, self.parser, d, self.charset)
    
    def _iter_date_reverse (self, dt=None) :
        """
            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
            given *datetime*, or the current date, if none given
        """
        
        # default to now
        if dt is None :
            dt = datetime.datetime.now(pytz.utc)
        
        # the starting date, in the target timezone
        d = dt.astimezone(self.tz).date()

        # our timedelta
        ONE_DAY = datetime.timedelta(1)
        
        # iterate unto infinity
        while True :
            yield d
            
            # one day backwards
            d -= ONE_DAY
    
    def get_latest (self, count) :
        """
            Uses _iter_date_reverse + _get_logfile_date to read the latest lines from as many logfiles as needed,
            starting with today's and working backwards one day at a time.

            Returns a list of at most `count` events, oldest first. Days without a logfile are skipped. Once more than
            MAX_FILES days have been tried and the current day has no logfile, the search gives up: whatever has been
            collected so far is returned, or an Exception is raised if nothing was found at all.
        """
        
        # iterate backwards from now
        day_iter = self._iter_date_reverse()

        # number of days tried
        files = 0

        # only try up to 100 files or so
        MAX_FILES = 100

        # read the events into here
        lines = []
        
        # loop until done
        while len(lines) < count :
            files += 1

            try :
                # open the next logfile
                logfile = self._get_logfile_date(next(day_iter))
            
            except IOError as e :
                # anything other than a missing file is a real error
                if e.errno != errno.ENOENT :
                    raise

                if files > MAX_FILES :
                    if lines :
                        # ran out of history; return what we have
                        break

                    raise Exception("No recent logfiles found")
                
                # skip to next day
                continue
            
            try :
                # only ask for what is still missing, so that we never end up with more than count events
                # XXX: use a queue
                lines = list(logfile.get_latest(count - len(lines))) + lines

            finally :
                # the events have been read into a list, so the file is no longer needed
                logfile.file.close()
        
        # return the events
        return lines

    def get_date (self, dt) :
        """
            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
            datetime differs from our native timezone, this may involve lines from more than one logfile.

            `dt` must be timezone-aware, as it is converted using astimezone(). Returns an iterable of events.
        """

        # begin/end of 24h period, in target timezone
        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)

        # as dates
        d_begin = dtz_begin.date() 
        d_end = dtz_end.date()

        # if they're the same, just pull the full log for that date
        if d_begin == d_end :
            return self._get_logfile_date(d_begin).read_full()
        
        # otherwise, we need to pull two partial logs: the tail of the first and the head of the second
        f_begin = self._get_logfile_date(d_begin)
        f_end = self._get_logfile_date(d_end)
        
        # chain together the two sources
        return itertools.chain(f_begin.read_from(dtz_begin), f_end.read_until(dtz_end))