log_source.py
changeset 50 f13cf27a360b
parent 48 7858b7b8ffe3
child 54 b65a95eb9f6b
equal deleted inserted replaced
49:aaa62c8e5bd5 50:f13cf27a360b
     1 """
     1 """
     2     A source of IRC log files
     2     A source of IRC log files
     3 """
     3 """
     4 
     4 
     5 import codecs
     5 import datetime, itertools
     6 from datetime import date, datetime, timedelta
     6 import os, errno
     7 import pytz
     7 import pytz
     8 
     8 
     9 # for SEEK_*, errno
       
    10 import os, errno
       
    11 
       
    12 class LogSource (object) :
     9 class LogSource (object) :
    13     """
    10     """
    14         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    11         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    15     """
    12     """
    16     
    13     
    18         """
    15         """
    19             Yield the latest events, up to `count` of them.
    16             Yield the latest events, up to `count` of them.
    20         """
    17         """
    21 
    18 
    22         abstract
    19         abstract
       
    20     
       
    21     def get_date (self, dt) :
       
    22         """
       
    23             Get logs for the given date (as a datetime)
       
    24         """
       
    25 
       
    26         abstract
    23 
    27 
    24 class LogFile (LogSource) :
    28 class LogFile (LogSource) :
    25     """
    29     """
    26         A file containing LogEvents
    30         A file containing LogEvents
    27     """
    31     """
    28 
    32 
    29     def __init__ (self, path, charset='utf-8', sep='\n') :
    33     def __init__ (self, path, parser, start_date=None, charset='utf-8', sep='\n') :
    30         """
    34         """
    31             Open the file at the given path, which contains data of the given codec, as lines separated by the given separator
    35             Open the file at the given path, which contains data with the given charset, as lines separated by the
       
    36             given separator. Lines are parsed using the given parser, using the given date as an initial date, see
       
    37             LogParser for more info. XXX: currently we assume start_date also for the end of the file
    32         """
    38         """
    33         
    39         
    34         # store
    40         # store
    35         self.path = path
    41         self.path = path
       
    42         self.parser = parser
       
    43         self.start_date = start_date
    36         self.charset = charset
    44         self.charset = charset
    37         self.sep = sep
    45         self.sep = sep
    38 
    46 
    39         # open
    47         # open
    40         self.file = open(path, 'rb')
    48         self.file = open(path, 'rb')
    41     
    49     
    42     def __iter__ (self) :
    50     def __iter__ (self) :
    43         """
    51         """
    44             Yields a series of lines, as read from the top of the file
    52             Yields a series of unicode lines, as read from the top of the file
    45         """
    53         """
    46         
    54         
    47         # seek to beginning
    55         # seek to beginning
    48         self.file.seek(0)
    56         self.file.seek(0)
    49 
    57 
    50         # iterate over lines
    58         # iterate over lines, decoding them as well
    51         return iter(self.file)
    59         return (line.decode(self.charset) for line in self.file)
    52     
    60     
    53     def get_latest (self, count) :
    61     def read_full (self) :
    54         """
    62         """
    55             Returns up to <count> lines from the end of the file, or less, if the file doesn't contain that many lines
    63             Reads all LogLines
    56         """
    64         """
    57 
    65         
    58         # the list of lines
    66         # just use our __iter__
    59         lines = []
    67         return self.parser.parse_lines(self, self.start_date)
       
    68 
       
    69     def read_from (self, dt) :
       
    70         """
       
    71             Reads all LogLines from the given naive timestamp onwards
       
    72         """
       
    73         
       
    74         # start reading at beginning
       
    75         events = self.read_full()
       
    76         
       
    77         # skip unwanted events
       
    78         for event in events :
       
    79             if event.timestamp < dt :
       
    80                 continue
       
    81 
       
    82             else :
       
    83                 # include this line as well
       
    84                 yield event
       
    85                 break
       
    86         
       
    87         # yield the rest as-is
       
    88         for event in events :
       
    89             yield event
       
    90 
       
    91     def read_until (self, dt) :
       
    92         """
       
    93             Reads all LogLines up until the given naive timestamp
       
    94         """
       
    95 
       
    96         # start reading events at the beginning
       
    97         events = self.read_full()
       
    98 
       
    99         # yield events until we hit the given timestamp
       
   100         for event in events :
       
   101             if event.timestamp <= dt :
       
   102                 yield event
       
   103 
       
   104             else :
       
   105                 break
       
   106             
       
   107         # ignore the rest
       
   108         return
       
   109 
       
   110     def _read_blocks_reverse (self, blocksize=1024) :
       
   111         """
       
   112             Yields blocks of file data in reverse order, starting at the end of the file
       
   113         """
    60 
   114 
    61         # seek to end of file
   115         # seek to end of file
    62         self.file.seek(0, os.SEEK_END)
   116         self.file.seek(0, os.SEEK_END)
    63 
   117 
    64         # read offset
   118         # read offset
    65         # XXX: hack -1 to get rid of trailing newline
   119         # XXX: hack -1 to get rid of trailing newline
    66         size = offset = self.file.tell() - 1
   120         size = offset = self.file.tell() - 1
    67 
   121         
    68         # use this blocksize
   122         # do not try to read past the beginning of the file
    69         BLOCKSIZE = 1024
   123         while offset > 0:
    70 
       
    71         # trailing data
       
    72         buf = ''
       
    73 
       
    74         # read a block at a time, backwards
       
    75         while len(lines) < count and offset > 0:
       
    76             # calc new offset + size
   124             # calc new offset + size
    77             if offset > BLOCKSIZE :
   125             if offset > blocksize :
    78                 # full block
   126                 # full block
    79                 offset -= BLOCKSIZE
   127                 offset -= blocksize
    80                 read_size = BLOCKSIZE
   128                 read_size = blocksize
    81 
   129 
    82             else :
   130             else :
    83                 # partial block
   131                 # partial block
    84                 read_size = offset
   132                 read_size = offset
    85                 offset = 0
   133                 offset = 0
    86 
   134 
    87             # seek to offset
   135             # seek to offset
    88             self.file.seek(offset)
   136             self.file.seek(offset)
    89 
   137 
    90             # read the data we want
   138             # read the data we want
    91             read_buf = self.file.read(read_size)
   139             block = self.file.read(read_size)
    92             read_len = len(read_buf)
       
    93 
   140 
    94             # sanity check
   141             # sanity check
    95             assert read_len == read_size
   142             assert len(block) == read_size
    96 
   143 
       
   144             # yield 
       
   145             yield block
       
   146     
       
   147     def _read_lines_reverse (self) :
       
   148         """
       
   149             Yields decoded lines from the end of the file, in reverse order.
       
   150         """
       
   151 
       
   152         # partial lines
       
   153         buf = ''
       
   154         
       
   155         # read from end of file, a block at a time
       
   156         for block in self._read_blocks_reverse() :
    97             # add in our previous buf
   157             # add in our previous buf
    98             buf = read_buf + buf
   158             buf = block + buf
    99             
   159             
   100             # split out lines
   160             # split up lines
   101             buf_lines = buf.split(self.sep)
   161             lines = buf.split(self.sep)
   102 
   162 
   103             # keep the first one as our buffer, as it's incomplete
   163             # keep the first one as our buffer, as it's incomplete
   104             buf = buf_lines[0]
   164             buf = lines[0]
   105 
   165            
   106             # prepend up to count lines from the end to our lines buffer
   166             # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
   107             lines = buf_lines[-min(count, len(buf_lines) - 1):] + lines
   167             # XXX: use something like islice, this has to build a slice object
   108         
   168             for line in lines[:0:-1] :
   109         # decode
   169                 yield line.decode(self.charset)
   110         # XXX: better queue implementation, plz
   170 
   111         lines = [line.decode(self.charset) for line in lines]
   171     def get_latest (self, count) :
   112 
   172         """
   113         # return the line list
   173             Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
   114         return lines
   174         """
       
   175 
       
   176         # the list of lines
       
   177         lines = []
       
   178 
       
   179         # start reading lines into lines
       
   180         for line in self._read_lines_reverse() :
       
   181             # append
       
   182             lines.append(line)
       
   183 
       
   184             # done?
       
   185             if len(lines) >= count :
       
   186                 break
       
   187         
       
   188         # decode in reverse order, using our starting date....
       
   189         # XXX: use lines[::-1] or reversed?
       
   190         # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
       
   191         return self.parser.parse_lines(reversed(lines), self.start_date)
   115 
   192 
   116 class LogDirectory (LogSource) :
   193 class LogDirectory (LogSource) :
   117     """
   194     """
   118         A directory containing a series of timestamped LogFiles
   195         A directory containing a series of timestamped LogFiles
   119     """
   196     """
   120 
   197 
   121     def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
   198     def __init__ (self, path, tz, parser, charset='utf-8', filename_fmt='%Y-%m-%d') :
   122         """
   199         """
   123             Load the logfiles at the given path.
   200             Load the logfiles at the given path.
   124             
   201             
    125             The files contain data in the given charset, and are named according to the date in the given timezone and
   202             The files contain data in the given charset, and are named according to the date in the given timezone and
   126             date format.
   203             date format, and will be parsed using the given parser.
   127         """
   204         """
   128 
   205 
   129         # store
   206         # store
   130         self.path = path
   207         self.path = path
   131         self.tz = tz
   208         self.tz = tz
       
   209         self.parser = parser
   132         self.charset = charset
   210         self.charset = charset
   133         self.filename_fmt = filename_fmt
   211         self.filename_fmt = filename_fmt
   134 
   212 
   135     def _get_logfile_datetime (self, dt) :
   213     def _get_logfile_datetime (self, dt) :
   136         """
   214         """
   153 
   231 
   154         # build path
   232         # build path
   155         path = os.path.join(self.path, filename)
   233         path = os.path.join(self.path, filename)
   156 
   234 
   157         # return the LogFile
   235         # return the LogFile
   158         return LogFile(path, self.charset)
   236         return LogFile(path, self.parser, d, self.charset)
   159     
   237     
   160     def _iter_backwards (self, dt=None) :
   238     def _iter_date_reverse (self, dt=None) :
   161         """
   239         """
   162             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
   240             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
    163             given *datetime*, or the current date, if none given
   241             given *datetime*, or the current date, if none given
   164         """
   242         """
   165         
   243         
   166         # default to now
   244         # default to now
   167         if not dt :
   245         if not dt :
   168             dt = datetime.now(pytz.utc)
   246             dt = datetime.datetime.now(pytz.utc)
   169         
   247         
   170         # convert to target timezone
   248         # convert to target timezone
   171         dtz = dt.astimezone(self.tz)
   249         dtz = dt.astimezone(self.tz)
   172 
   250 
   173         # our timedelta
   251         # our timedelta
   174         ONE_DAY = timedelta(1)
   252         ONE_DAY = datetime.timedelta(1)
   175         
   253         
   176         # iterate unto infinity
   254         # iterate unto infinity
   177         while True :
   255         while True :
   178             # yield
   256             # yield
   179             yield dtz.date()
   257             yield dtz.date()
   185         """
   263         """
    186             Uses _iter_backwards + _get_logfile_date to read and yield the given lines from as many logfiles as needed
   264             Uses _iter_date_reverse + _get_logfile_date to read and yield the given lines from as many logfiles as needed
   187         """
   265         """
   188         
   266         
   189         # iterate backwards from now
   267         # iterate backwards from now
   190         day_iter = self._iter_backwards()
   268         day_iter = self._iter_date_reverse()
   191 
   269 
   192         # number of files read
   270         # number of files read
   193         files = 0
   271         files = 0
   194 
   272 
   195         # only read up to 100 files or so
   273         # only read up to 100 files or so
   196         MAX_FILES = 100
   274         MAX_FILES = 100
   197 
   275 
   198         # read the lines into here
   276         # read the events into here
   199         lines = []
   277         lines = []
   200         
   278         
   201         # loop until done
   279         # loop until done
   202         while len(lines) < count :
   280         while len(lines) < count :
   203             logfile = None
   281             logfile = None
   219                 
   297                 
   220                 else :
   298                 else :
   221                     # skip to next day
   299                     # skip to next day
   222                     continue
   300                     continue
   223             
   301             
   224             # read the lines
   302             # read the events
   225             lines = logfile.get_latest(count) + lines
   303             # XXX: use a queue
   226         
   304             lines = list(logfile.get_latest(count)) + lines
   227         # return the lines
   305         
       
   306         # return the events
   228         return lines
   307         return lines
   229 
   308 
       
   309     def get_date (self, dt) :
       
   310         """
       
   311             A 'day' is considered to be a 24-hour period from 00:00:00 to 23:59:59. If the timezone of the given datetime
       
   312             differs from our native timezone, this may involve lines from more than one logfile.
       
   313         """
       
   314 
       
   315         # begin/end of 24h period, in target timezone
       
   316         dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
       
   317         dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
       
   318 
       
   319         # as dates
       
   320         d_begin = dtz_begin.date() 
       
   321         d_end = dtz_end.date()
       
   322 
       
   323         # if they're the same, just pull the full log for that date
       
   324         if d_begin == d_end :
       
   325             return self._get_logfile_date(d_begin).read_full()
       
   326         
       
   327         # otherwise, we need to pull two partial logs
       
   328         else :
       
   329             # open both of them
       
   330             f_begin = self._get_logfile_date(d_begin)
       
   331             f_end = self._get_logfile_date(d_end)
       
   332             
       
   333             # chain together the two sources
       
   334             return itertools.chain(f_begin.read_from(dtz_begin), f_end.read_until(dtz_end))
       
   335