qmsk/irclogs/log_source.py
changeset 140 6db2527b67cf
parent 115 751e3fcd11d2
child 141 65c98c9e1716
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qmsk/irclogs/log_source.py	Sun Sep 13 01:15:56 2009 +0300
@@ -0,0 +1,679 @@
+"""
+    A source of IRC log files
+"""
+
+import datetime, calendar, itertools, functools, math
+import os, os.path, errno
+import pytz
+
+import config, utils
+
+# a timedelta that represents one day
+ONE_DAY = datetime.timedelta(days=1)
+
+class LogSourceDecoder (object) :
+    """
+        Handles decoding of LogSource lines.
+
+        Tries each configured (charset, errors) pair in order until one of
+        them successfully decodes a raw byte string into unicode.
+    """
+
+    def __init__ (self, encoding_list) :
+        """
+            Will try each of the given (charset, errors) items in turn, until one succeeds.
+
+            encoding_list   - sequence of (charset, errors) tuples, as accepted by str.decode()
+        """
+
+        # ordered candidates for decode() to try
+        self.encoding_list = encoding_list
+    
+    def decode (self, line) :
+        """
+            Decode the line of str() text into an unicode object.
+
+            Raises if none of the configured encodings can decode the line.
+        """
+        
+        # list of errors encountered
+        error_list = []
+        
+        # try each in turn
+        for charset, errors in self.encoding_list :
+            # trap UnicodeDecodeError to try with the next one
+            try :
+                return line.decode(charset, errors)
+
+            except UnicodeDecodeError, e :
+                # remember why this candidate failed, for the message below
+                error_list.append("%s:%s - %s" % (charset, errors, e))
+                continue
+
+        # failure
+        # NOTE(review): UnicodeDecodeError's constructor takes five arguments
+        # (encoding, object, start, end, reason); calling it with a single
+        # message string raises TypeError at runtime instead of the intended
+        # exception -- consider raising UnicodeError or ValueError here.
+        raise UnicodeDecodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
+
+class LogSource (object) :
+    """
+        A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
+    """
+    
+    def __init__ (self, decoder, channel=None) :
+        """
+            The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
+            known, then it can be given as None, and set later with bind_channel.
+
+            Uses the given LogSourceDecoder to decode the lines.
+        """
+        
+        # the LogChannel this source belongs to, or None until bind_channel()
+        self.channel = channel
+        # the LogSourceDecoder used to decode raw lines
+        self.decoder = decoder
+    
+    def bind_channel (self, channel) :
+        """
+            Set this source's channel, where None was set before
+        """
+
+        # a channel must not already be bound
+        assert not self.channel
+
+        self.channel = channel
+
+    def get_latest (self, count) :
+        """
+            Yield the latest events, up to `count` of them.
+        """
+
+        # evaluating the bare name raises NameError, marking the method as
+        # abstract (a pre-`abc` idiom used throughout this class)
+        abstract
+    
+    def get_date (self, dt) :
+        """
+            Get logs for the given date (as a datetime).
+        """
+
+        # abstract (see get_latest)
+        abstract
+    
+    def get_date_paged (self, dt, count, page=None) :
+        """
+            Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
+            portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
+            then the lines for the page containing the given timestamp is returned.
+
+            The return value is a (page, max, lines) tuple.
+        """
+        
+        # how to act?
+        # pages are numbered from 1; page=0 and page=None both fall through
+        # to the timestamp-search branch below
+        if page :
+            # constant skip
+            skip = (page - 1) * count
+
+        else :
+            skip = None
+
+            # go through the logs a page at a time
+            this_page = 1
+
+            # last line's timestamp
+            last_ts = None
+
+            # found it yet?
+            found = False
+
+        # count the full number of lines
+        line_count = 0
+
+        # collect lines
+        lines = []
+
+        # iterate using get_date
+        for line in self.get_date(dt) :
+            # count them
+            line_count += 1
+
+            # skip?
+            if skip :
+                skip -= 1
+                continue
+            
+            # is this page all that we want/need?
+            if page or found :
+                # already full?
+                if len(lines) >= count :
+                    continue
+
+            # specific timestamp
+            else :
+                # didn't find it in this page?
+                if len(lines) >= count :
+                    # reset to next page
+                    lines = []
+                    this_page += 1
+
+                # is dt between these two timestamps?
+                if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
+                    # found!
+                    found = True
+                    page = this_page
+
+                else :
+                    # keep looking
+                    last_ts = line.timestamp
+
+            # store line
+            lines.append(line)
+        
+        # calculate max_pages
+        # NOTE(review): math.ceil returns a float here, so max_pages is a
+        # float; also raises ZeroDivisionError when count is 0
+        max_pages = math.ceil(float(line_count) / count)
+        
+        # return
+        return (page, max_pages, lines)
+
+    def get_month_days (self, dt) :
+        """
+            Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
+        """
+
+        # abstract (see get_latest)
+        abstract
+    
+    def get_modified (self, dt=None, after=None, until=None) :
+        """
+            Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.
+
+            If the datetime is not given, *all* lines are returned.
+
+            If after is given, only lines from said date onwards will be returned, regardless of modification.
+            If until is given, only lines up to and including said date will be returned, regardless of modification.
+
+            The LogLines should be in time order.
+        """
+
+        # abstract (see get_latest)
+        abstract
+    
+    def get_prev_date (self, dt) :
+        """
+            Get the next distinct date of logs available preceding the given date, or None
+        """
+
+        # abstract (see get_latest)
+        abstract
+
+    def get_next_date (self, dt) :
+        """
+            Get the next distinct date of logs following the given date, or None.
+        """
+        
+        # abstract (see get_latest)
+        abstract
+
+class LogFile (object) :
+    """
+        A file containing LogEvents
+
+        XXX: modify to implement LogSource?
+    """
+
+    def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
+        """
+            Open the file at the given path, which contains lines as separated by the given separator. Lines are
+            decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
+            as the initial date for this log's first line.
+            
+            XXX: currently we assume start_date also for the end of the file
+        """
+        
+        # store
+        self.channel = channel
+        self.path = path
+        self.parser = parser
+        self.start_date = start_date
+        self.decoder = decoder
+        self.sep = sep
+
+        # open
+        # NOTE: opened in binary mode and never explicitly closed -- closing
+        # is left to garbage collection
+        self.file = open(path, 'rb')
+
+    def __iter__ (self) :
+        """
+            Yields a series of unicode lines, as read from the top of the file
+        """
+        
+        # seek to beginning
+        # NOTE: rewinds the shared file object, so concurrent iterations over
+        # the same LogFile would interfere with each other
+        self.file.seek(0)
+
+        # iterate over lines, decoding them as well
+        return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
+    
+    def read_full (self) :
+        """
+            Reads all LogLines. The LogLines will have a valid offset.
+        """
+        
+        # just use our __iter__
+        return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)
+
+    def read_from (self, dt) :
+        """
+            Reads all LogLines from the given naive timestamp onwards
+        """
+        
+        # start reading at beginning
+        events = self.read_full()
+        
+        # skip unwanted events
+        for event in events :
+            if event.timestamp < dt :
+                continue
+
+            else :
+                # include this line as well
+                yield event
+                break
+        
+        # yield the rest as-is
+        # (the events iterator continues from where the loop above broke off)
+        for event in events :
+            yield event
+
+    def read_until (self, dt) :
+        """
+            Reads all LogLines up until the given naive timestamp
+        """
+
+        # start reading events at the beginning
+        events = self.read_full()
+
+        # yield events until we hit the given timestamp
+        for event in events :
+            if event.timestamp <= dt :
+                yield event
+
+            else :
+                break
+            
+        # ignore the rest
+        return
+
+    def _read_blocks_reverse (self, blocksize=1024) :
+        """
+            Yields blocks of file data in reverse order, starting at the end of the file
+        """
+
+        # seek to end of file
+        self.file.seek(0, os.SEEK_END)
+
+        # read offset
+        # XXX: hack -1 to get rid of trailing newline
+        # NOTE: assumes the file ends with the separator; for an empty file
+        # this gives size == -1 and the loop below yields nothing
+        size = offset = self.file.tell() - 1
+        
+        # do not try to read past the beginning of the file
+        while offset > 0:
+            # calc new offset + size
+            if offset > blocksize :
+                # full block
+                offset -= blocksize
+                read_size = blocksize
+
+            else :
+                # partial block
+                read_size = offset
+                offset = 0
+
+            # seek to offset
+            self.file.seek(offset)
+
+            # read the data we want
+            block = self.file.read(read_size)
+
+            # sanity check
+            assert len(block) == read_size
+
+            # yield 
+            yield block
+    
+    def _read_lines_reverse (self) :
+        """
+            Yields decoded lines from the end of the file, in reverse order.
+
+            NOTE(review): once all blocks are consumed, `buf` still holds the
+            file's first line, which is never yielded -- the first line of the
+            file is silently dropped. A final yield of the decoded buf after
+            the loop would fix this.
+        """
+
+        # partial lines
+        buf = ''
+        
+        # read from end of file, a block at a time
+        for block in self._read_blocks_reverse() :
+            # add in our previous buf
+            buf = block + buf
+            
+            # split up lines
+            lines = buf.split(self.sep)
+
+            # keep the first one as our buffer, as it's incomplete
+            buf = lines[0]
+           
+            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
+            # XXX: use something like islice, this has to build a slice object
+            for line in lines[:0:-1] :
+                yield self.decoder.decode(line)
+
+    def read_latest (self, count) :
+        """
+            Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
+        """
+
+        # the list of lines
+        lines = []
+
+        # start reading lines into lines
+        for line in self._read_lines_reverse() :
+            # append
+            lines.append(line)
+
+            # done?
+            if len(lines) >= count :
+                break
+        
+        # decode in reverse order, using our starting date....
+        # XXX: use lines[::-1] or reversed?
+        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
+        return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
+
+class LogDirectory (LogSource) :
+    """
+        A directory containing a series of timestamped LogFiles
+    """
+
+    def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
+        """
+            Load the logfiles at the given path, which are for the given LogChannel
+            
+            Decode the file lines using the given decoder, the files are named according to the date in the given
+            timezone and date format, and will be parsed using the given parser.
+        """
+
+        # store
+        # NOTE: does not call LogSource.__init__; the attributes are set
+        # directly here instead
+        self.channel = channel
+        self.path = path
+        self.tz = tz
+        self.parser = parser
+        self.decoder = decoder
+        self.filename_fmt = filename_fmt
+
+    def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
+        """
+            Get the logfile corresponding to the given naive date in our timezone. 
+            
+            If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
+            then this returns the file's mtime
+
+            Returns None for a missing logfile when ignore_missing is True; otherwise the IOError propagates.
+        """
+
+        # format filename
+        filename = d.strftime(self.filename_fmt)
+
+        # build path
+        path = os.path.join(self.path, filename)
+        
+        try :
+            if load :
+                # open+return the LogFile
+                return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)
+            
+            elif mtime :
+                # stat
+                # NOTE(review): only IOError is trapped below -- confirm that
+                # utils.mtime raises IOError (not OSError) for a missing file
+                return utils.mtime(path)
+
+            else :
+                # test (os.path.exists returns False for missing files, so
+                # this branch never raises for a missing logfile)
+                return os.path.exists(path)
+
+        # XXX: move to LogFile
+        except IOError, e :
+            # return None for missing files
+            if e.errno == errno.ENOENT and ignore_missing :
+                return None
+
+            else :
+                raise
+    
+    def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
+        """
+            Yields a series of naive datetime objects representing the logfiles that are available, in time order.
+            
+            Parameters :
+                after   only dates from said date onwards will be returned
+                until   only dates up to and including said date will be returned
+                reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
+        """
+
+        # convert timestamps to our timezone's dates
+        if after :
+            after = after.astimezone(self.tz).date()
+
+        if until :
+            until = until.astimezone(self.tz).date()
+
+        # listdir
+        filenames = os.listdir(self.path)
+
+        # sort
+        filenames.sort(reverse=reverse)
+
+        # iter files
+        for filename in filenames :
+            try :
+                # parse date
+                dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
+                date = dt.date()
+            
+            except :
+                # ignore filenames that don't match filename_fmt
+                continue
+
+            else :
+                if (after and date < after) or (until and date > until) :
+                    # ignore
+                    continue
+                
+                else :
+                    # yield
+                    yield dt
+            
+    def _iter_date_reverse (self, dt=None) :
+        """
+            Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
+            given *datetime*, or the current date, if none given
+        """
+        
+        # default to now
+        if not dt :
+            # NOTE(review): datetime.now() is the host's local wall time;
+            # localizing it into self.tz assumes the host runs in that
+            # timezone -- confirm, or use utcnow() + astimezone
+            dtz = self.tz.localize(datetime.datetime.now())
+
+        else :
+            # convert to target timezone
+            dtz = dt.astimezone(self.tz)
+
+        # iterate unto infinity
+        while True :
+            # yield
+            yield dtz.date()
+            
+            # one day sdrawkcab
+            dtz -= ONE_DAY
+    
+    def _iter_logfile_reverse (self, dt=None, max_files=100) :
+        """
+            Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
+            current date, if none given.
+
+            Reads/probes at most max_files files.
+        """
+        
+        # start counting at zero...
+        file_count = 0
+
+        # have we found any files at all so far?
+        have_found = False
+
+        # iterate backwards over days
+        for day in self._iter_date_reverse(dt) :
+            # stop if we've handled enough files by now
+            # NOTE(review): the strict > comparison means up to max_files + 1
+            # files are actually probed
+            if file_count > max_files :
+                break
+            
+            # try and open the next logfile
+            logfile = None
+            
+            file_count += 1
+            logfile = self._get_logfile_date(day, ignore_missing=True)
+            
+            # no logfile there?
+            if not logfile :
+                # hit our limit?
+                if file_count > max_files :
+                    # if we didn't find any logfiles at all, terminate rudely
+                    if not have_found :
+                        raise Exception("No recent logfiles found")
+                    
+                    else :
+                        # stop looking, deal with what we've got
+                        return
+
+                else :
+                    # skip to next day
+                    continue
+            
+            # mark have_found
+            have_found = True
+
+            # yield it
+            yield logfile
+
+    def get_latest (self, count) :
+        """
+            Uses _iter_logfile_reverse to read the given number of lines from as many logfiles as needed
+        """
+
+        # read the events into here
+        lines = []
+        
+        # start reading in those logfiles
+        for logfile in self._iter_logfile_reverse() :
+            # read the events
+            # XXX: use a queue
+            lines = list(logfile.read_latest(count)) + lines
+
+            # done?
+            if len(lines) >= count :
+                break
+        
+        # return the events
+        return lines
+
+    def get_date (self, dt) :
+        """
+            A 'day' is considered to be a 24-hour period from 00:00:00 23:59:59. If the timezone of the given datetime
+            differs from our native datetime, this may involve lines from more than one logfile.
+
+            NOTE: dt must be timezone-aware, as astimezone() is called on it.
+        """
+
+        # begin/end of 24h period, in target timezone
+        # NOTE(review): dtz_begin does not reset microsecond, so it inherits
+        # dt's microsecond value -- probably harmless, but confirm
+        dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
+        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
+
+        # as dates
+        d_begin = dtz_begin.date() 
+        d_end = dtz_end.date()
+        
+#        print
+#        print "LogDirectory.get_date - %s" % dt
+#        print "\t   %s %s" % (d_begin, dtz_begin)
+#        print "\t-> %s %s" % (d_end, dtz_end)
+
+        # if they're the same, just pull the full log for that date
+        if d_begin == d_end :
+            # open that log
+            logfile = self._get_logfile_date(d_begin)
+            
+            # return the full data
+            return logfile.read_full()
+        
+        # otherwise, we need to pull two partial logs
+        else :
+            # open both of them, but it's okay if we don't have the second one
+            f_begin = self._get_logfile_date(d_begin)
+            f_end = self._get_logfile_date(d_end, ignore_missing=True)
+
+            # chain together the two sources
+            return itertools.chain(
+                f_begin.read_from(dtz_begin), 
+                f_end.read_until(dtz_end) if f_end else []
+            )
+    
+    def _iter_month_days (self, month) :
+        """
+            Iterates over the days of a month as dt objects with time=0
+        """
+        
+        # there's at most 31 days in a month...
+        for day in xrange(1, 32) :
+            try :
+                # try and build the datetime
+                dt = datetime.datetime(month.year, month.month, day)
+
+            except :
+                # stop at the first invalid day (the datetime constructor
+                # raises for out-of-range days)
+                return
+            
+            else :
+                # fix timezones + yield
+                # NOTE(review): assumes month.tzinfo is a pytz timezone (has
+                # .localize()) -- confirm against callers
+                yield month.tzinfo.localize(dt)
+
+    def get_month_days (self, month) :
+        """
+            Yields the dates for which logfiles are available in the given datetime's month
+        """
+
+        # iterate over month's days
+        for dt in self._iter_month_days(month) :
+            # date in our target timezone
+            log_date = dt.astimezone(self.tz).date()
+            
+            # test for it
+            if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
+                # valid
+                yield dt.date()
+
+    def get_modified (self, dt=None, after=None, until=None) :
+        """
+            Returns the contents of all logfiles with mtimes past the given date
+        """
+
+        # iterate through all available logfiles in date order, as datetimes, from the given date on
+        for log_date in self._iter_logfile_dates(after, until) :
+            # compare against dt?
+            if dt :
+                # stat
+                mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)
+                
+                # not modified?
+                # NOTE(review): a missing file gives mtime=None, which sorts
+                # below any datetime under Python 2 and is thus skipped; this
+                # comparison would raise TypeError on Python 3
+                if mtime < dt :
+                    # skip
+                    continue
+                
+            # open
+            logfile = self._get_logfile_date(log_date)
+
+            # yield all lines
+            for line in logfile.read_full() :
+                yield line
+
+    def get_prev_date (self, dt) :
+        """
+            Just use _iter_logfile_dates
+        """
+        
+        # use for to "iter" once; the else-clause only runs when the iterator
+        # is empty, i.e. no earlier logfile exists
+        for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) :
+            return log_date
+        
+        else :
+            return None
+
+    def get_next_date (self, dt) :
+        """
+            Just use _iter_logfile_dates
+        """
+        
+        # use for to "iter" once; the else-clause only runs when the iterator
+        # is empty, i.e. no later logfile exists
+        for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) :
+            return log_date
+        
+        else :
+            return None
+