sites/irclogs.qmsk.net/log_source.py
changeset 46 185504387370
parent 45 e94ab812c0c8
child 47 3d59c9eeffaa
equal deleted inserted replaced
45:e94ab812c0c8 46:185504387370
     1 """
       
     2     A source of IRC log files
       
     3 """
       
     4 
       
     5 import codecs
       
     6 from datetime import date, datetime, timedelta
       
     7 import pytz
       
     8 
       
     9 # for SEEK_*, errno
       
    10 import os, errno
       
    11 
       
    12 class LogSource (object) :
       
    13     """
       
    14         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
       
    15     """
       
    16     
       
    17     def get_latest (self, count) :
       
    18         """
       
    19             Yield the latest events, up to `count` of them.
       
    20         """
       
    21 
       
    22         abstract
       
    23 
       
    24 class LogFile (LogSource) :
       
    25     """
       
    26         A file containing LogEvents
       
    27     """
       
    28 
       
    29     def __init__ (self, path, charset='utf-8', sep='\n') :
       
    30         """
       
    31             Open the file at the given path, which contains data of the given codec, as lines separated by the given separator
       
    32         """
       
    33         
       
    34         # store
       
    35         self.path = path
       
    36         self.charset = charset
       
    37         self.sep = sep
       
    38 
       
    39         # open
       
    40         self.file = codecs.open(path, 'r', charset)
       
    41     
       
    42     def __iter__ (self) :
       
    43         """
       
    44             Yields a series of lines, as read from the top of the file
       
    45         """
       
    46         
       
    47         # seek to beginning
       
    48         self.file.seek(0)
       
    49 
       
    50         # iterate over lines
       
    51         return iter(self.file)
       
    52     
       
    53     def get_latest (self, count) :
       
    54         """
       
    55             Returns up to <count> lines from the end of the file, or less, if the file doesn't contain that many lines
       
    56         """
       
    57 
       
    58         # the list of lines
       
    59         lines = []
       
    60 
       
    61         # seek to end of file
       
    62         self.file.seek(0, os.SEEK_END)
       
    63 
       
    64         # read offset
       
    65         # XXX; why -2 ?
       
    66         size = offset = self.file.tell() - 2
       
    67 
       
    68         # use this blocksize
       
    69         BLOCKSIZE = 1024
       
    70 
       
    71         # trailing data
       
    72         buf = ''
       
    73 
       
    74         # read a block at a time, backwards
       
    75         while  count > 0 and offset >= 0:
       
    76             # update offset back one block
       
    77             offset -= BLOCKSIZE
       
    78 
       
    79             # normalize to zero
       
    80             if offset < 0 :
       
    81                 offset = 0
       
    82 
       
    83             # seek to offset
       
    84             self.file.seek(offset)
       
    85 
       
    86             # add the new block to our buffer
       
    87             read_buf = self.file.read(BLOCKSIZE)
       
    88 
       
    89             # XXX: trim off extra...
       
    90             if len(read_buf) > BLOCKSIZE :
       
    91                 read_buf = read_buf[:BLOCKSIZE]
       
    92 
       
    93             # make sure we got the right amount of data
       
    94             assert len(read_buf) == BLOCKSIZE, "read(%d) @ %d/%d -> %d" % (BLOCKSIZE, offset, size, len(read_buf))
       
    95 
       
    96             # add in our previous buf
       
    97             buf = read_buf + buf
       
    98             
       
    99             # split out lines
       
   100             buf_lines = buf.split(self.sep)
       
   101 
       
   102             # keep the first one as our buffer, as it's incomplete
       
   103             buf = buf_lines[0]
       
   104 
       
   105             # add up to count lines to our lines buffer
       
   106             lines = buf_lines[-min(count, len(buf_lines) - 1):] + lines
       
   107 
       
   108             # update count
       
   109             count -= (len(buf_lines) - 1)
       
   110 
       
   111         # return the line list
       
   112         return lines
       
   113 
       
   114 class LogDirectory (LogSource) :
       
   115     """
       
   116         A directory containing a series of timestamped LogFiles
       
   117     """
       
   118 
       
   119     def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
       
   120         """
       
   121             Load the logfiles at the given path.
       
   122             
       
   123             The files contain data in the given charset, and are named according the the date in the given timezone and
       
   124             date format.
       
   125         """
       
   126 
       
   127         # store
       
   128         self.path = path
       
   129         self.tz = tz
       
   130         self.charset = charset
       
   131         self.filename_fmt = filename_fmt
       
   132 
       
   133     def _get_logfile_datetime (self, dt) :
       
   134         """
       
   135             Get the logfile corresponding to the given datetime
       
   136         """
       
   137 
       
   138         # convert to target timezone
       
   139         dtz = dt.astimezone(self.tz)
       
   140         
       
   141         # convert to date and use that
       
   142         return self._get_logfile_date(dtz.date())
       
   143 
       
   144     def _get_logfile_date (self, d) :
       
   145         """
       
   146             Get the logfile corresponding to the given naive date in our timezone
       
   147         """
       
   148 
       
   149         # format filename
       
   150         filename = d.strftime(self.filename_fmt)
       
   151 
       
   152         # build path
       
   153         path = os.path.join(self.path, filename)
       
   154 
       
   155         # return the LogFile
       
   156         return LogFile(path, self.charset)
       
   157     
       
   158     def _iter_backwards (self, dt=None) :
       
   159         """
       
   160             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
       
   161             given *datetime*, or the the current date, if none given
       
   162         """
       
   163         
       
   164         # default to now
       
   165         if not dt :
       
   166             dt = datetime.now(pytz.utc)
       
   167         
       
   168         # convert to target timezone
       
   169         dtz = dt.astimezone(self.tz)
       
   170 
       
   171         # our timedelta
       
   172         ONE_DAY = timedelta(1)
       
   173         
       
   174         # iterate unto infinity
       
   175         while True :
       
   176             # yield
       
   177             yield dtz.date()
       
   178             
       
   179             # one day sdrawkcab
       
   180             dtz -= ONE_DAY
       
   181     
       
   182     def get_latest (self, count) :
       
   183         """
       
   184             Uses _iter_backwards + _get_logfile_date to read the yield the given lines from as many logfiles as needed
       
   185         """
       
   186         
       
   187         # iterate backwards from now
       
   188         day_iter = self._iter_backwards()
       
   189 
       
   190         # number of files read
       
   191         files = 0
       
   192 
       
   193         # only read up to 100 files or so
       
   194         MAX_FILES = 100
       
   195         
       
   196         # loop until done
       
   197         while count > 0 :
       
   198             logfile = None
       
   199 
       
   200             try :
       
   201                 # get next logfile
       
   202                 files += 1
       
   203                 
       
   204                 # open
       
   205                 logfile = self._get_logfile_date(day_iter.next())
       
   206             
       
   207             except IOError, e :
       
   208                 # skip nonexistant days if we haven't found any logs yet
       
   209                 if e.errno != errno.ENOENT :
       
   210                     raise
       
   211 
       
   212                 if files > MAX_FILES :
       
   213                     raise Exception("No recent logfiles found")
       
   214                 
       
   215                 else :
       
   216                     # skip to next day
       
   217                     continue
       
   218 
       
   219             # yield lines
       
   220             for line in logfile.get_latest(count) :
       
   221                 # yield while we still need to, otherwise, stop
       
   222                 if count > 0 :
       
   223                     # decrement
       
   224                     count -= 1
       
   225  
       
   226                     yield line
       
   227             
       
   228                 else :
       
   229                     break
       
   230 
       
   231