log_source.py
changeset 140 6db2527b67cf
parent 139 9c7769850195
child 141 65c98c9e1716
--- a/log_source.py	Sun Sep 13 00:49:55 2009 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,679 +0,0 @@
-"""
-    A source of IRC log files
-"""
-
-import datetime, calendar, itertools, functools, math
-import os, os.path, errno
-import pytz
-
-import config, utils
-
-# a timedelta that represents exactly one day, used for stepping from one date to the previous/next one
-ONE_DAY = datetime.timedelta(days=1)
-
-class LogSourceDecoder (object) :
-    """
-        Handles decoding of LogSource lines
-    """
-
-    def __init__ (self, encoding_list) :
-        """
-            Will try each of the given (charset, errors) items in turn, until one succeeds
-        """
-
-        self.encoding_list = encoding_list
-
-    def decode (self, line) :
-        """
-            Decode the line of str() text into an unicode object.
-
-            Each (charset, errors) item is tried in order, and the first successful result is returned. If every
-            one of them fails (or there are none), an UnicodeDecodeError listing the individual failures is raised.
-        """
-
-        # list of errors encountered
-        error_list = []
-
-        # try each in turn
-        for charset, errors in self.encoding_list :
-            # trap UnicodeDecodeError to try with the next one
-            try :
-                return line.decode(charset, errors)
-
-            except UnicodeDecodeError, e :
-                error_list.append("%s:%s - %s" % (charset, errors, e))
-
-        # failure
-        # UnicodeDecodeError can only be built as (encoding, object, start, end, reason); giving it just a message
-        # would raise a TypeError instead
-        charsets = ','.join(charset for charset, errors in self.encoding_list) or 'none'
-
-        raise UnicodeDecodeError(
-            charsets, line, 0, len(line),
-            "Failed to decode line: %r: %s" % (line, ', '.join(error_list))
-        )
-
-class LogSource (object) :
-    """
-        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events.
-
-        This is an abstract base: apart from get_date_paged, the get_* methods must be implemented by subclasses,
-        and raise NotImplementedError here.
-    """
-
-    def __init__ (self, decoder, channel=None) :
-        """
-            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
-            known, then it can be given as None, and set later with bind_channel.
-
-            Uses the given LogSourceDecoder to decode the lines.
-        """
-
-        self.channel = channel
-        self.decoder = decoder
-
-    def bind_channel (self, channel) :
-        """
-            Set this source's channel, where None was set before
-        """
-
-        assert not self.channel
-
-        self.channel = channel
-
-    def get_latest (self, count) :
-        """
-            Yield the latest events, up to `count` of them.
-        """
-
-        raise NotImplementedError()
-
-    def get_date (self, dt) :
-        """
-            Get logs for the given date (as a datetime).
-        """
-
-        raise NotImplementedError()
-
-    def get_date_paged (self, dt, count, page=None) :
-        """
-            Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
-            portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
-            then the lines for the page containing the given timestamp is returned.
-
-            The return value is a (page, max, lines) tuple, where max is the total number of pages as an int. If no
-            page was given and no line matching the timestamp was found, the returned page is None.
-        """
-
-        # how to act?
-        if page :
-            # constant skip
-            skip = (page - 1) * count
-
-        else :
-            skip = None
-
-            # go through the logs a page at a time
-            this_page = 1
-
-            # last line's timestamp
-            last_ts = None
-
-            # found it yet?
-            found = False
-
-        # count the full number of lines
-        line_count = 0
-
-        # collect lines
-        lines = []
-
-        # iterate using get_date
-        for line in self.get_date(dt) :
-            # count them, including those that we skip, as they are needed for the page count
-            line_count += 1
-
-            # skip?
-            if skip :
-                skip -= 1
-                continue
-
-            # is this page all that we want/need?
-            if page or found :
-                # already full? Keep going anyways to count the remaining lines
-                if len(lines) >= count :
-                    continue
-
-            # specific timestamp
-            else :
-                # didn't find it in this page?
-                if len(lines) >= count :
-                    # reset to next page
-                    lines = []
-                    this_page += 1
-
-                # is dt between these two timestamps?
-                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
-                    # found!
-                    found = True
-                    page = this_page
-
-                else :
-                    # keep looking
-                    last_ts = line.timestamp
-
-            # store line
-            lines.append(line)
-
-        # calculate max_pages, rounding up using integer math so that we get an int, not a float
-        max_pages = (line_count + count - 1) // count
-
-        # return
-        return (page, max_pages, lines)
-
-    def get_month_days (self, dt) :
-        """
-            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
-        """
-
-        raise NotImplementedError()
-
-    def get_modified (self, dt=None, after=None, until=None) :
-        """
-            Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.
-
-            If the datetime is not given, *all* lines are returned.
-
-            If after is given, only lines from said date onwards will be returned, regardless of modification.
-            If until is given, only lines up to and including said date will be returned, regardless of modification.
-
-            The LogLines should be in time order.
-        """
-
-        raise NotImplementedError()
-
-    def get_prev_date (self, dt) :
-        """
-            Get the previous distinct date of logs available preceding the given date, or None
-        """
-
-        raise NotImplementedError()
-
-    def get_next_date (self, dt) :
-        """
-            Get the next distinct date of logs following the given date, or None.
-        """
-
-        raise NotImplementedError()
-
-class LogFile (object) :
-    """
-        A file containing LogEvents
-
-        XXX: modify to implement LogSource?
-    """
-
-    def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
-        """
-            Set up a log file at the given path, and open it for reading in binary mode.
-
-            The file consists of lines separated by `sep`. Each line is decoded with the given LogSourceDecoder and
-            then handed to the given parser, with start_date as the date of the log's first line.
-
-            XXX: currently we assume start_date also for the end of the file
-        """
-
-        # where the data is, and how it is split up
-        self.path = path
-        self.sep = sep
-
-        # how to turn the raw lines into events
-        self.decoder = decoder
-        self.parser = parser
-
-        # what the events belong to
-        self.channel = channel
-        self.start_date = start_date
-
-        # the file itself
-        self.file = open(path, 'rb')
-
-    def __iter__ (self) :
-        """
-            Rewinds the file, and returns an iterator over its lines from the top, each one stripped of trailing
-            separator characters and decoded.
-        """
-
-        fh = self.file
-
-        # always start from the top
-        fh.seek(0)
-
-        # decoding happens lazily, as the lines are consumed
-        return (
-            self.decoder.decode(raw.rstrip(self.sep))
-            for raw in fh
-        )
-    
-    def read_full (self) :
-        """
-            Parses every line of the file, from the top. Offsets are counted starting at 1.
-        """
-
-        # we are our own source of lines, see __iter__
-        parse_lines = self.parser.parse_lines
-
-        return parse_lines(self.channel, self, self.start_date, starting_offset=1)
-
-    def read_from (self, dt) :
-        """
-            Yields the LogLines starting at the first one whose timestamp is not before dt, and then everything
-            that follows it.
-        """
-
-        # drop the leading events that are too early, the rest are passed through untouched
-        wanted = itertools.dropwhile(lambda event : event.timestamp < dt, self.read_full())
-
-        for event in wanted :
-            yield event
-
-    def read_until (self, dt) :
-        """
-            Yields the LogLines from the top of the file, stopping at the first one whose timestamp is past dt.
-        """
-
-        # everything after the first event that is too late is ignored
-        wanted = itertools.takewhile(lambda event : event.timestamp <= dt, self.read_full())
-
-        for event in wanted :
-            yield event
-
-    def _read_blocks_reverse (self, blocksize=1024) :
-        """
-            Yields blocks of file data in reverse order, starting at the end of the file. Each block is at most
-            blocksize bytes long.
-
-            If the file ends in a line separator, that one trailing separator is left out of the data, so that
-            splitting the data on the separator does not give a bogus empty last line. A file that does not end
-            in a separator is returned in full.
-        """
-
-        # seek to end of file
-        self.file.seek(0, os.SEEK_END)
-
-        # read offset
-        offset = self.file.tell()
-
-        # get rid of the trailing separator, but only if it's really there
-        sep_len = len(self.sep)
-
-        if sep_len and offset >= sep_len :
-            self.file.seek(offset - sep_len)
-
-            if self.file.read(sep_len) == self.sep :
-                offset -= sep_len
-
-        # do not try to read past the beginning of the file
-        while offset > 0 :
-            # a full block, or whatever is left at the start of the file
-            read_size = min(offset, blocksize)
-            offset -= read_size
-
-            # seek to offset
-            self.file.seek(offset)
-
-            # read the data we want
-            block = self.file.read(read_size)
-
-            # sanity check
-            assert len(block) == read_size
-
-            # yield
-            yield block
-    
-    def _read_lines_reverse (self) :
-        """
-            Yields decoded lines from the end of the file, in reverse order, all the way up to and including the
-            first line of the file.
-        """
-
-        # partial lines
-        buf = ''
-
-        # did we get any data at all?
-        have_data = False
-
-        # read from end of file, a block at a time
-        for block in self._read_blocks_reverse() :
-            have_data = True
-
-            # add in our previous buf
-            buf = block + buf
-
-            # split up lines
-            lines = buf.split(self.sep)
-
-            # keep the first one as our buffer, as it may continue in the preceding block
-            buf = lines[0]
-
-            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
-            # XXX: use something like islice, this has to build a slice object
-            for line in lines[:0:-1] :
-                yield self.decoder.decode(line)
-
-        # once we hit the start of the file, whatever is left in the buffer is the complete first line
-        if have_data :
-            yield self.decoder.decode(buf)
-
-    def read_latest (self, count) :
-        """
-            Parses the last `count` lines of the file, or all of them, if the file is shorter than that.
-        """
-
-        # lines from the end of the file, newest first
-        newest_first = []
-
-        for text in self._read_lines_reverse() :
-            newest_first.append(text)
-
-            # got enough?
-            if count <= len(newest_first) :
-                break
-
-        # the parser wants the lines in file order, starting from our starting date...
-        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
-        in_order = reversed(newest_first)
-
-        return self.parser.parse_lines(self.channel, in_order, self.start_date)
-
-class LogDirectory (LogSource) :
-    """
-        A directory containing a series of timestamped LogFiles
-    """
-
-    def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
-        """
-            Set up a source reading the logfiles in the directory at the given path, for the given LogChannel.
-
-            Each logfile is named after its date in the given timezone, formatted using filename_fmt. The lines in
-            the files are decoded using the given decoder and parsed using the given parser.
-        """
-
-        # where the logfiles are, and how they are named
-        self.path = path
-        self.filename_fmt = filename_fmt
-        self.tz = tz
-
-        # how to read them
-        self.decoder = decoder
-        self.parser = parser
-
-        # what they are for
-        self.channel = channel
-
-    def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
-        """
-            Get the logfile corresponding to the given naive date in our timezone.
-
-            If load is given (the default), the LogFile is opened and returned. Otherwise, if mtime is given, the
-            file's mtime as given by utils.mtime is returned. If neither is given, this only tests for the presence
-            of the logfile, returning a bool.
-
-            If the logfile does not exist and ignore_missing is given, None is returned instead of letting the
-            error through.
-        """
-
-        # format filename + build path
-        path = os.path.join(self.path, d.strftime(self.filename_fmt))
-
-        try :
-            if load :
-                # open+return the LogFile
-                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)
-
-            elif mtime :
-                # stat
-                return utils.mtime(path)
-
-            else :
-                # test
-                return os.path.exists(path)
-
-        # XXX: move to LogFile
-        # open() fails with an IOError, but utils.mtime presumably stat()s, which fails with an OSError instead;
-        # EnvironmentError is the common base of both
-        except EnvironmentError, e :
-            # return None for missing files
-            if e.errno == errno.ENOENT and ignore_missing :
-                return None
-
-            raise
-    
-    def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
-        """
-            Yields a series of datetime objects, localized to our timezone, representing the logfiles that are
-            available. The order is that of the sorted filenames, so it is only time order as long as filename_fmt
-            sorts the same way as the dates do.
-
-            Files whose names do not match filename_fmt are ignored.
-
-            Parameters :
-                after   only dates from said date onwards will be returned
-                until   only dates up to and including said date will be returned
-                reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
-        """
-
-        # convert timestamps to our timezone's dates
-        if after :
-            after = after.astimezone(self.tz).date()
-
-        if until :
-            until = until.astimezone(self.tz).date()
-
-        # listdir + sort
-        filenames = os.listdir(self.path)
-        filenames.sort(reverse=reverse)
-
-        # iter files
-        for filename in filenames :
-            try :
-                # parse date
-                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
-
-            except ValueError :
-                # not named like a logfile, ignore. Anything else going wrong is a real error that we want to see
-                continue
-
-            date = dt.date()
-
-            if (after and date < after) or (until and date > until) :
-                # out of range, ignore
-                continue
-
-            # yield
-            yield dt
-            
-    def _iter_date_reverse (self, dt=None) :
-        """
-            Yields an infinite series of date objects in our timezone, iterating backwards in time starting at the
-            given *datetime*, or the current date, if none given
-        """
-
-        if not dt :
-            # default to now. This must be the current time *in our timezone*; localize()'ing a naive now() would
-            # instead stamp the system's local wall-clock time with our timezone, which is off whenever the two differ
-            dtz = datetime.datetime.now(self.tz)
-
-        else :
-            # convert to target timezone
-            dtz = dt.astimezone(self.tz)
-
-        # iterate unto infinity
-        while True :
-            # yield
-            yield dtz.date()
-
-            # one day sdrawkcab
-            dtz -= ONE_DAY
-    
-    def _iter_logfile_reverse (self, dt=None, max_files=100) :
-        """
-            Yields the LogFiles that exist, going backwards in time one day at a time, starting at the given
-            datetime, or the current date, if none given.
-
-            Gives up once more than max_files days have been probed. If not a single logfile was found by the time
-            the limit is hit on a missing file, this raises an Exception.
-        """
-
-        # number of days probed so far
-        probed = 0
-
-        # has any of them had a logfile?
-        found_any = False
-
-        for day in self._iter_date_reverse(dt) :
-            # done enough work already?
-            if probed > max_files :
-                return
-
-            probed += 1
-            logfile = self._get_logfile_date(day, ignore_missing=True)
-
-            if logfile :
-                # got one
-                found_any = True
-
-                yield logfile
-                continue
-
-            # nothing for this day
-            if probed <= max_files :
-                # try the day before
-                continue
-
-            if found_any :
-                # stop looking, the caller has to deal with what it got
-                return
-
-            # nothing at all, terminate rudely
-            raise Exception("No recent logfiles found")
-
-    def get_latest (self, count) :
-        """
-            Uses _iter_logfile_reverse to read the latest lines from as many logfiles as needed, returning a list of
-            up to `count` of them, in time order.
-        """
-
-        # read the events into here
-        lines = []
-
-        # start reading in those logfiles, newest first
-        for logfile in self._iter_logfile_reverse() :
-            # only ask for what we are still missing, or we end up with more than count lines in total
-            remaining = count - len(lines)
-
-            # read the events, these are older than what we already have
-            # XXX: use a queue
-            lines = list(logfile.read_latest(remaining)) + lines
-
-            # done?
-            if len(lines) >= count :
-                break
-
-        # return the events
-        return lines
-
-    def get_date (self, dt) :
-        """
-            Returns the lines for the day that the given datetime falls on, in the datetime's own timezone.
-
-            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
-            datetime differs from our native timezone, this may involve lines from more than one logfile. In that
-            case, the second logfile is allowed to be missing, but the first one is not.
-        """
-
-        # begin/end of 24h period, in target timezone
-        # the microseconds must be reset as well, or lines right at midnight would be considered to be too early
-        dtz_begin = dt.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(self.tz)
-        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
-
-        # as dates
-        d_begin = dtz_begin.date()
-        d_end = dtz_end.date()
-
-        # if they're the same, just pull the full log for that date
-        if d_begin == d_end :
-            return self._get_logfile_date(d_begin).read_full()
-
-        # otherwise, we need to pull two partial logs; it's okay if we don't have the second one
-        f_begin = self._get_logfile_date(d_begin)
-        f_end = self._get_logfile_date(d_end, ignore_missing=True)
-
-        # chain together the two sources
-        return itertools.chain(
-            f_begin.read_from(dtz_begin),
-            f_end.read_until(dtz_end) if f_end else []
-        )
-    
-    def _iter_month_days (self, month) :
-        """
-            Iterates over the days of the given datetime's month as datetime objects with time=0, localized using
-            the given datetime's tzinfo.
-        """
-
-        # ask the calendar how long the month is, instead of probing for invalid dates
-        weekday, num_days = calendar.monthrange(month.year, month.month)
-
-        for day in xrange(1, num_days + 1) :
-            # build the datetime
-            dt = datetime.datetime(month.year, month.month, day)
-
-            # fix timezones + yield
-            # assumes month.tzinfo has a pytz-style localize() -- it is not a part of the plain tzinfo interface
-            yield month.tzinfo.localize(dt)
-
-    def get_month_days (self, month) :
-        """
-            Yields the dates in the given datetime's month for which a logfile is available, in order.
-        """
-
-        for day_dt in self._iter_month_days(month) :
-            # logfiles are named by the date in our own timezone
-            our_date = day_dt.astimezone(self.tz).date()
-
-            # nothing there for this day?
-            if not self._get_logfile_date(our_date, load=False, ignore_missing=True) :
-                continue
-
-            # valid, report it as the date that was asked about
-            yield day_dt.date()
-
-    def get_modified (self, dt=None, after=None, until=None) :
-        """
-            Yields the full contents of the available logfiles within after/until, in logfile order. If dt is given,
-            logfiles with an mtime before it are left out.
-        """
-
-        for log_date in self._iter_logfile_dates(after, until) :
-            # only stat the file if we actually have something to compare against, and skip those not modified
-            if dt and self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True) < dt :
-                continue
-
-            # open + yield all lines
-            for line in self._get_logfile_date(log_date).read_full() :
-                yield line
-
-    def get_prev_date (self, dt) :
-        """
-            The latest logfile date that is at least one day before the given datetime, as per _iter_logfile_dates,
-            or None if there is no such logfile.
-        """
-
-        candidates = self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True)
-
-        # the first one is the one we want, if there is one
-        for log_date in candidates :
-            return log_date
-
-        return None
-
-    def get_next_date (self, dt) :
-        """
-            The earliest logfile date that is at least one day after the given datetime, as per _iter_logfile_dates,
-            or None if there is no such logfile.
-        """
-
-        candidates = self._iter_logfile_dates(after=dt + ONE_DAY)
-
-        # the first one is the one we want, if there is one
-        for log_date in candidates :
-            return log_date
-
-        return None
-