sites/irclogs.qmsk.net/log_source.py
branchsites
changeset 41 9585441a4bfb
child 43 fc11c4e86a82
equal deleted inserted replaced
40:71ab68f31a1c 41:9585441a4bfb
       
     1 """
       
     2     A source of IRC log files
       
     3 """
       
     4 
       
     5 import codecs
       
     6 from datetime import date, datetime, timedelta
       
     7 import pytz
       
     8 
       
     9 # for SEEK_*, errno
       
    10 import os, errno
       
    11 
       
    12 class LogSource (object) :
       
    13     """
       
    14         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
       
    15     """
       
    16     
       
    17     def get_latest (self, count) :
       
    18         """
       
    19             Yield the latest events, up to `count` of them.
       
    20         """
       
    21 
       
    22         abstract
       
    23 
       
    24 class LogFile (LogSource) :
       
    25     """
       
    26         A file containing LogEvents
       
    27     """
       
    28 
       
    29     def __init__ (self, path, charset='utf-8', sep='\n') :
       
    30         """
       
    31             Open the file at the given path, which contains data of the given codec, as lines separated by the given separator
       
    32         """
       
    33         
       
    34         # store
       
    35         self.path = path
       
    36         self.charset = charset
       
    37         self.sep = sep
       
    38 
       
    39         # open
       
    40         self.file = codecs.open(path, 'r', charset)
       
    41     
       
    42     def __iter__ (self) :
       
    43         """
       
    44             Yields a series of lines, as read from the top of the file
       
    45         """
       
    46         
       
    47         # seek to beginning
       
    48         self.file.seek(0)
       
    49 
       
    50         # iterate over lines
       
    51         return iter(self.file)
       
    52     
       
    53     def get_latest (self, count) :
       
    54         """
       
    55             Returns up to <count> lines from the end of the file, or less, if the file doesn't contain that many lines
       
    56         """
       
    57 
       
    58         # the list of lines
       
    59         lines = []
       
    60 
       
    61         # seek to end of file
       
    62         self.file.seek(0, os.SEEK_END)
       
    63 
       
    64         # read offset
       
    65         # XXX; why -2 ?
       
    66         offset = self.file.tell() - 2
       
    67 
       
    68         # use this blocksize
       
    69         BLOCKSIZE = 1024
       
    70 
       
    71         # trailing data
       
    72         buf = ''
       
    73 
       
    74         # read a block at a time, backwards
       
    75         while  count > 0 and offset >= 0:
       
    76             # update offset
       
    77             offset -= BLOCKSIZE
       
    78 
       
    79             # normalize to zero
       
    80             if offset < 0 :
       
    81                 offset = 0
       
    82 
       
    83             # seek backwards one block
       
    84             self.file.seek(offset)
       
    85 
       
    86             # add the new block to our buffer
       
    87             read_buf = self.file.read(BLOCKSIZE)
       
    88 
       
    89             # make sure we got the right amount of data
       
    90             assert len(read_buf) == BLOCKSIZE, "read(%d) -> %d" % (BLOCKSIZE, len(read_buf))
       
    91 
       
    92             # add in our previous buf
       
    93             buf = read_buf + buf
       
    94             
       
    95             # split out lines
       
    96             buf_lines = buf.split(self.sep)
       
    97 
       
    98             # keep the first one as our buffer, as it's incomplete
       
    99             buf = buf_lines[0]
       
   100 
       
   101             # add up to count lines to our lines buffer
       
   102             lines = buf_lines[1:count + 1] + lines
       
   103 
       
   104             # update count
       
   105             count -= (len(buf_lines) - 1)
       
   106 
       
   107         # return the line list
       
   108         return lines
       
   109 
       
   110 class LogDirectory (LogSource) :
       
   111     """
       
   112         A directory containing a series of timestamped LogFiles
       
   113     """
       
   114 
       
   115     def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
       
   116         """
       
   117             Load the logfiles at the given path.
       
   118             
       
   119             The files contain data in the given charset, and are named according the the date in the given timezone and
       
   120             date format.
       
   121         """
       
   122 
       
   123         # store
       
   124         self.path = path
       
   125         self.tz = tz
       
   126         self.charset = charset
       
   127         self.filename_fmt = filename_fmt
       
   128 
       
   129     def _get_logfile_datetime (self, dt) :
       
   130         """
       
   131             Get the logfile corresponding to the given datetime
       
   132         """
       
   133 
       
   134         # convert to target timezone
       
   135         dtz = dt.astimezone(self.tz)
       
   136         
       
   137         # convert to date and use that
       
   138         return self._get_logfile_date(dtz.date())
       
   139 
       
   140     def _get_logfile_date (self, d) :
       
   141         """
       
   142             Get the logfile corresponding to the given naive date in our timezone
       
   143         """
       
   144 
       
   145         # format filename
       
   146         filename = d.strftime(self.filename_fmt)
       
   147 
       
   148         # build path
       
   149         path = os.path.join(self.path, filename)
       
   150 
       
   151         # return the LogFile
       
   152         return LogFile(path, self.charset)
       
   153     
       
   154     def _iter_backwards (self, dt=None) :
       
   155         """
       
   156             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
       
   157             given *datetime*, or the the current date, if none given
       
   158         """
       
   159         
       
   160         # default to now
       
   161         if not dt :
       
   162             dt = datetime.now(pytz.utc)
       
   163         
       
   164         # convert to target timezone
       
   165         dtz = dt.astimezone(self.tz)
       
   166 
       
   167         # our timedelta
       
   168         ONE_DAY = timedelta(1)
       
   169         
       
   170         # iterate unto infinity
       
   171         while True :
       
   172             # yield
       
   173             yield dtz.date()
       
   174             
       
   175             # one day sdrawkcab
       
   176             dtz -= ONE_DAY
       
   177     
       
   178     def get_latest (self, count) :
       
   179         """
       
   180             Uses _iter_backwards + _get_logfile_date to read the yield the given lines from as many logfiles as needed
       
   181         """
       
   182         
       
   183         # iterate backwards from now
       
   184         day_iter = self._iter_backwards()
       
   185 
       
   186         # number of files read
       
   187         files = 0
       
   188 
       
   189         # only read up to 100 files or so
       
   190         MAX_FILES = 100
       
   191         
       
   192         # loop until done
       
   193         while count > 0 :
       
   194             logfile = None
       
   195 
       
   196             try :
       
   197                 # get next logfile
       
   198                 files += 1
       
   199                 
       
   200                 # open
       
   201                 logfile = self._get_logfile_date(day_iter.next())
       
   202             
       
   203             except IOError, e :
       
   204                 # skip nonexistant days if we haven't found any logs yet
       
   205                 if e.errno != errno.ENOENT :
       
   206                     raise
       
   207 
       
   208                 if files > MAX_FILES :
       
   209                     raise Exception("No recent logfiles found")
       
   210                 
       
   211                 else :
       
   212                     # skip to next day
       
   213                     continue
       
   214 
       
   215             # yield lines
       
   216             for line in logfile.get_latest(count) :
       
   217                 # yield while we still need to, otherwise, stop
       
   218                 if count > 0 :
       
   219                     # decrement
       
   220                     count -= 1
       
   221  
       
   222                     yield line
       
   223             
       
   224                 else :
       
   225                     break
       
   226 
       
   227