log_source.py
changeset 50 f13cf27a360b
parent 48 7858b7b8ffe3
child 54 b65a95eb9f6b
--- a/log_source.py	Sun Feb 08 04:59:22 2009 +0200
+++ b/log_source.py	Mon Feb 09 00:24:13 2009 +0200
@@ -2,13 +2,10 @@
     A source of IRC log files
 """
 
-import codecs
-from datetime import date, datetime, timedelta
+import datetime, itertools
+import os, errno
 import pytz
 
-# for SEEK_*, errno
-import os, errno
-
 class LogSource (object) :
     """
         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
@@ -20,19 +17,30 @@
         """
 
         abstract
+    
+    def get_date (self, dt) :
+        """
+            Get logs for the given date (as a datetime)
+        """
+
+        abstract
 
 class LogFile (LogSource) :
     """
         A file containing LogEvents
     """
 
-    def __init__ (self, path, charset='utf-8', sep='\n') :
+    def __init__ (self, path, parser, start_date=None, charset='utf-8', sep='\n') :
         """
-            Open the file at the given path, which contains data of the given codec, as lines separated by the given separator
+            Open the file at the given path, which contains data with the given charset, as lines separated by the
+            given separator. Lines are parsed using the given parser, using the given date as an initial date, see
+            LogParser for more info. XXX: currently we assume start_date also for the end of the file
         """
         
         # store
         self.path = path
+        self.parser = parser
+        self.start_date = start_date
         self.charset = charset
         self.sep = sep
 
@@ -41,22 +49,68 @@
     
     def __iter__ (self) :
         """
-            Yields a series of lines, as read from the top of the file
+            Yields a series of unicode lines, as read from the top of the file
         """
         
         # seek to beginning
         self.file.seek(0)
 
-        # iterate over lines
-        return iter(self.file)
+        # iterate over lines, decoding them as well
+        return (line.decode(self.charset) for line in self.file)
     
-    def get_latest (self, count) :
+    def read_full (self) :
         """
-            Returns up to <count> lines from the end of the file, or less, if the file doesn't contain that many lines
+            Reads all LogLines
+        """
+        
+        # just use our __iter__
+        return self.parser.parse_lines(self, self.start_date)
+
+    def read_from (self, dt) :
+        """
+            Reads all LogLines from the given naive timestamp onwards
+        """
+        
+        # start reading at beginning
+        events = self.read_full()
+        
+        # skip unwanted events
+        for event in events :
+            if event.timestamp < dt :
+                continue
+
+            else :
+                # include this line as well
+                yield event
+                break
+        
+        # yield the rest as-is
+        for event in events :
+            yield event
+
+    def read_until (self, dt) :
+        """
+            Reads all LogLines up until the given naive timestamp
         """
 
-        # the list of lines
-        lines = []
+        # start reading events at the beginning
+        events = self.read_full()
+
+        # yield events until we hit the given timestamp
+        for event in events :
+            if event.timestamp <= dt :
+                yield event
+
+            else :
+                break
+            
+        # ignore the rest
+        return
+
+    def _read_blocks_reverse (self, blocksize=1024) :
+        """
+            Yields blocks of file data in reverse order, starting at the end of the file
+        """
 
         # seek to end of file
         self.file.seek(0, os.SEEK_END)
@@ -64,20 +118,14 @@
         # read offset
         # XXX: hack -1 to get rid of trailing newline
         size = offset = self.file.tell() - 1
-
-        # use this blocksize
-        BLOCKSIZE = 1024
-
-        # trailing data
-        buf = ''
-
-        # read a block at a time, backwards
-        while len(lines) < count and offset > 0:
+        
+        # do not try to read past the beginning of the file
+        while offset > 0:
             # calc new offset + size
-            if offset > BLOCKSIZE :
+            if offset > blocksize :
                 # full block
-                offset -= BLOCKSIZE
-                read_size = BLOCKSIZE
+                offset -= blocksize
+                read_size = blocksize
 
             else :
                 # partial block
@@ -88,47 +136,77 @@
             self.file.seek(offset)
 
             # read the data we want
-            read_buf = self.file.read(read_size)
-            read_len = len(read_buf)
+            block = self.file.read(read_size)
 
             # sanity check
-            assert read_len == read_size
+            assert len(block) == read_size
 
+            # yield 
+            yield block
+    
+    def _read_lines_reverse (self) :
+        """
+            Yields decoded lines from the end of the file, in reverse order.
+        """
+
+        # partial lines
+        buf = ''
+        
+        # read from end of file, a block at a time
+        for block in self._read_blocks_reverse() :
             # add in our previous buf
-            buf = read_buf + buf
+            buf = block + buf
             
-            # split out lines
-            buf_lines = buf.split(self.sep)
+            # split up lines
+            lines = buf.split(self.sep)
 
             # keep the first one as our buffer, as it's incomplete
-            buf = buf_lines[0]
+            buf = lines[0]
+           
+            # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
+            # XXX: use something like islice, this has to build a slice object
+            for line in lines[:0:-1] :
+                yield line.decode(self.charset)
 
-            # prepend up to count lines from the end to our lines buffer
-            lines = buf_lines[-min(count, len(buf_lines) - 1):] + lines
+    def get_latest (self, count) :
+        """
+            Returns up to count events from the end of the file, or fewer if the file doesn't contain that many lines.
+        """
+
+        # the list of lines
+        lines = []
+
+        # start reading lines into lines
+        for line in self._read_lines_reverse() :
+            # append
+            lines.append(line)
+
+            # done?
+            if len(lines) >= count :
+                break
         
-        # decode
-        # XXX: better queue implementation, plz
-        lines = [line.decode(self.charset) for line in lines]
-
-        # return the line list
-        return lines
+        # decode in reverse order, using our starting date....
+        # XXX: use lines[::-1] or reversed?
+        # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
+        return self.parser.parse_lines(reversed(lines), self.start_date)
 
 class LogDirectory (LogSource) :
     """
         A directory containing a series of timestamped LogFiles
     """
 
-    def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
+    def __init__ (self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d') :
         """
             Load the logfiles at the given path.
             
             The files contain data in the given charset, and are named according to the date in the given timezone and
-            date format.
+            date format, and will be parsed using the given parser.
         """
 
         # store
         self.path = path
         self.tz = tz
+        self.parser = parser
         self.charset = charset
         self.filename_fmt = filename_fmt
 
@@ -155,9 +233,9 @@
         path = os.path.join(self.path, filename)
 
         # return the LogFile
-        return LogFile(path, self.charset)
+        return LogFile(path, self.parser, d, self.charset)
     
-    def _iter_backwards (self, dt=None) :
+    def _iter_date_reverse (self, dt=None) :
         """
             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
             given *datetime*, or the current date, if none given
@@ -165,13 +243,13 @@
         
         # default to now
         if not dt :
-            dt = datetime.now(pytz.utc)
+            dt = datetime.datetime.now(pytz.utc)
         
         # convert to target timezone
         dtz = dt.astimezone(self.tz)
 
         # our timedelta
-        ONE_DAY = timedelta(1)
+        ONE_DAY = datetime.timedelta(1)
         
         # iterate unto infinity
         while True :
@@ -187,7 +265,7 @@
         """
         
         # iterate backwards from now
-        day_iter = self._iter_backwards()
+        day_iter = self._iter_date_reverse()
 
         # number of files read
         files = 0
@@ -195,7 +273,7 @@
         # only read up to 100 files or so
         MAX_FILES = 100
 
-        # read the lines into here
+        # read the events into here
         lines = []
         
         # loop until done
@@ -221,9 +299,37 @@
                     # skip to next day
                     continue
             
-            # read the lines
-            lines = logfile.get_latest(count) + lines
+            # read the events
+            # XXX: use a queue
+            lines = list(logfile.get_latest(count)) + lines
         
-        # return the lines
+        # return the events
         return lines
 
+    def get_date (self, dt) :
+        """
+            A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given
+            datetime differs from our native timezone, this may involve lines from more than one logfile.
+        """
+
+        # begin/end of 24h period, in target timezone
+        dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
+        dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
+
+        # as dates
+        d_begin = dtz_begin.date() 
+        d_end = dtz_end.date()
+
+        # if they're the same, just pull the full log for that date
+        if d_begin == d_end :
+            return self._get_logfile_date(d_begin).read_full()
+        
+        # otherwise, we need to pull two partial logs
+        else :
+            # open both of them
+            f_begin = self._get_logfile_date(d_begin)
+            f_end = self._get_logfile_date(d_end)
+            
+            # chain together the two sources
+            return itertools.chain(f_begin.read_from(dtz_begin), f_end.read_until(dtz_end))
+