log_source.py
changeset 140 6db2527b67cf
parent 139 9c7769850195
child 141 65c98c9e1716
equal deleted inserted replaced
139:9c7769850195 140:6db2527b67cf
     1 """
       
     2     A source of IRC log files
       
     3 """
       
     4 
       
     5 import datetime, calendar, itertools, functools, math
       
     6 import os, os.path, errno
       
     7 import pytz
       
     8 
       
     9 import config, utils
       
    10 
       
# A timedelta of exactly one day; added to / subtracted from datetimes to
# step between the dates of consecutive daily logfiles.
ONE_DAY = datetime.timedelta(days=1)
       
    13 
       
    14 class LogSourceDecoder (object) :
       
    15     """
       
    16         Handles decoding of LogSource lines
       
    17     """
       
    18 
       
    19     def __init__ (self, encoding_list) :
       
    20         """
       
    21             Will try each of the given (charset, errors) items in turn, until one succeeds
       
    22         """
       
    23 
       
    24         self.encoding_list = encoding_list
       
    25     
       
    26     def decode (self, line) :
       
    27         """
       
    28             Decode the line of str() text into an unicode object
       
    29         """
       
    30         
       
    31         # list of errors encountered
       
    32         error_list = []
       
    33         
       
    34         # try each in turn
       
    35         for charset, errors in self.encoding_list :
       
    36             # trap UnicodeDecodeError to try with the next one
       
    37             try :
       
    38                 return line.decode(charset, errors)
       
    39 
       
    40             except UnicodeDecodeError, e :
       
    41                 error_list.append("%s:%s - %s" % (charset, errors, e))
       
    42                 continue
       
    43 
       
    44         # failure
       
    45         raise UnicodeDecodeError("Failed to decode line: %r: %s" % (line, ', '.join(error_list)))
       
    46 
       
    47 class LogSource (object) :
       
    48     """
       
    49         A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
       
    50     """
       
    51     
       
    52     def __init__ (self, decoder, channel=None) :
       
    53         """
       
    54             The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet
       
    55             known, then it can be given as None, and set later with bind_channel.
       
    56 
       
    57             Uses the given LogSourceDecoder to decode the lines.
       
    58         """
       
    59         
       
    60         self.channel = channel
       
    61         self.decoder = decoder
       
    62     
       
    63     def bind_channel (self, channel) :
       
    64         """
       
    65             Set this source's channel, where None was set before
       
    66         """
       
    67 
       
    68         assert not self.channel
       
    69 
       
    70         self.channel = channel
       
    71 
       
    72     def get_latest (self, count) :
       
    73         """
       
    74             Yield the latest events, up to `count` of them.
       
    75         """
       
    76 
       
    77         abstract
       
    78     
       
    79     def get_date (self, dt) :
       
    80         """
       
    81             Get logs for the given date (as a datetime).
       
    82         """
       
    83 
       
    84         abstract
       
    85     
       
    86     def get_date_paged (self, dt, count, page=None) :
       
    87         """
       
    88             Get the logs for a given date (as a datetime), divided into pages of count each. If page is given, the time
       
    89             portion of the dt is ignored, and the lines for the given page are returned. Otherwise, if page is None,
       
    90             then the lines for the page containing the given timestamp is returned.
       
    91 
       
    92             The return value is a (page, max, lines) tuple.
       
    93         """
       
    94         
       
    95         # how to act?
       
    96         if page :
       
    97             # constant skip
       
    98             skip = (page - 1) * count
       
    99 
       
   100         else :
       
   101             skip = None
       
   102 
       
   103             # go through the logs a page at a time
       
   104             this_page = 1
       
   105 
       
   106             # last line's timestamp
       
   107             last_ts = None
       
   108 
       
   109             # found it yet?
       
   110             found = False
       
   111 
       
   112         # count the full number of lines
       
   113         line_count = 0
       
   114 
       
   115         # collect lines
       
   116         lines = []
       
   117 
       
   118         # iterate using get_date
       
   119         for line in self.get_date(dt) :
       
   120             # count them
       
   121             line_count += 1
       
   122 
       
   123             # skip?
       
   124             if skip :
       
   125                 skip -= 1
       
   126                 continue
       
   127             
       
   128             # is this page all that we want/need?
       
   129             if page or found :
       
   130                 # already full?
       
   131                 if len(lines) >= count :
       
   132                     continue
       
   133 
       
   134             # specfic timestamp
       
   135             else :
       
   136                 # didn't find it in this page?
       
   137                 if len(lines) >= count :
       
   138                     # reset to next page
       
   139                     lines = []
       
   140                     this_page += 1
       
   141 
       
   142                 # is dt between these two timestamps?
       
   143                 if (not last_ts or last_ts <= dt) and (dt <= line.timestamp) :
       
   144                     # found!
       
   145                     found = True
       
   146                     page = this_page
       
   147 
       
   148                 else :
       
   149                     # keep looking
       
   150                     last_ts = line.timestamp
       
   151 
       
   152             # store line
       
   153             lines.append(line)
       
   154         
       
   155         # calculate max_pages
       
   156         max_pages = math.ceil(float(line_count) / count)
       
   157         
       
   158         # return
       
   159         return (page, max_pages, lines)
       
   160 
       
   161     def get_month_days (self, dt) :
       
   162         """
       
   163             Return an ordered sequence of dates, telling which days in the given month (as a datetime) have logs available.
       
   164         """
       
   165 
       
   166         abstract
       
   167     
       
   168     def get_modified (self, dt=None, after=None, until=None) :
       
   169         """
       
   170             Returns a sequence of LogLines that may have been *modified* from their old values since the given datetime.
       
   171 
       
   172             If the datetime is not given, *all* lines are returned.
       
   173 
       
   174             If after is given, only lines from said date onwards will be returned, regardless of modification.
       
   175             If until is given, only lines up to and including said date will be returned, regardless of modification.
       
   176 
       
   177             The LogLines should be in time order.
       
   178         """
       
   179 
       
   180         abstract
       
   181     
       
   182     def get_prev_date (self, dt) :
       
   183         """
       
   184             Get the next distinct date of logs available preceeding the given date, or None
       
   185         """
       
   186 
       
   187         abstract
       
   188 
       
   189     def get_next_date (self, dt) :
       
   190         """
       
   191             Get the next distinct date of logs following the given date, or None.
       
   192         """
       
   193         
       
   194         abstract
       
   195 
       
   196 class LogFile (object) :
       
   197     """
       
   198         A file containing LogEvents
       
   199 
       
   200         XXX: modify to implement LogSource?
       
   201     """
       
   202 
       
   203     def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') :
       
   204         """
       
   205             Open the file at the given path, which contains lines as separated by the given separator. Lines are
       
   206             decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date
       
   207             as the initial date for this log's first line.
       
   208             
       
   209             XXX: currently we assume start_date also for the end of the file
       
   210         """
       
   211         
       
   212         # store
       
   213         self.channel = channel
       
   214         self.path = path
       
   215         self.parser = parser
       
   216         self.start_date = start_date
       
   217         self.decoder = decoder
       
   218         self.sep = sep
       
   219 
       
   220         # open
       
   221         self.file = open(path, 'rb')
       
   222 
       
   223     def __iter__ (self) :
       
   224         """
       
   225             Yields a series of unicode lines, as read from the top of the file
       
   226         """
       
   227         
       
   228         # seek to beginning
       
   229         self.file.seek(0)
       
   230 
       
   231         # iterate over lines, decoding them as well
       
   232         return (self.decoder.decode(line.rstrip(self.sep)) for line in self.file)
       
   233     
       
   234     def read_full (self) :
       
   235         """
       
   236             Reads all LogLines. The LogLines will have a valid offset.
       
   237         """
       
   238         
       
   239         # just use our __iter__
       
   240         return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1)
       
   241 
       
   242     def read_from (self, dt) :
       
   243         """
       
   244             Reads all LogLines from the given naive timestamp onwards
       
   245         """
       
   246         
       
   247         # start reading at beginning
       
   248         events = self.read_full()
       
   249         
       
   250         # skip unwanted events
       
   251         for event in events :
       
   252             if event.timestamp < dt :
       
   253                 continue
       
   254 
       
   255             else :
       
   256                 # include this line as well
       
   257                 yield event
       
   258                 break
       
   259         
       
   260         # yield the rest as-is
       
   261         for event in events :
       
   262             yield event
       
   263 
       
   264     def read_until (self, dt) :
       
   265         """
       
   266             Reads all LogLines up until the given naive timestamp
       
   267         """
       
   268 
       
   269         # start reading events at the beginning
       
   270         events = self.read_full()
       
   271 
       
   272         # yield events until we hit the given timestamp
       
   273         for event in events :
       
   274             if event.timestamp <= dt :
       
   275                 yield event
       
   276 
       
   277             else :
       
   278                 break
       
   279             
       
   280         # ignore the rest
       
   281         return
       
   282 
       
   283     def _read_blocks_reverse (self, blocksize=1024) :
       
   284         """
       
   285             Yields blocks of file data in reverse order, starting at the end of the file
       
   286         """
       
   287 
       
   288         # seek to end of file
       
   289         self.file.seek(0, os.SEEK_END)
       
   290 
       
   291         # read offset
       
   292         # XXX: hack -1 to get rid of trailing newline
       
   293         size = offset = self.file.tell() - 1
       
   294         
       
   295         # do not try to read past the beginning of the file
       
   296         while offset > 0:
       
   297             # calc new offset + size
       
   298             if offset > blocksize :
       
   299                 # full block
       
   300                 offset -= blocksize
       
   301                 read_size = blocksize
       
   302 
       
   303             else :
       
   304                 # partial block
       
   305                 read_size = offset
       
   306                 offset = 0
       
   307 
       
   308             # seek to offset
       
   309             self.file.seek(offset)
       
   310 
       
   311             # read the data we want
       
   312             block = self.file.read(read_size)
       
   313 
       
   314             # sanity check
       
   315             assert len(block) == read_size
       
   316 
       
   317             # yield 
       
   318             yield block
       
   319     
       
   320     def _read_lines_reverse (self) :
       
   321         """
       
   322             Yields decoded lines from the end of the file, in reverse order.
       
   323         """
       
   324 
       
   325         # partial lines
       
   326         buf = ''
       
   327         
       
   328         # read from end of file, a block at a time
       
   329         for block in self._read_blocks_reverse() :
       
   330             # add in our previous buf
       
   331             buf = block + buf
       
   332             
       
   333             # split up lines
       
   334             lines = buf.split(self.sep)
       
   335 
       
   336             # keep the first one as our buffer, as it's incomplete
       
   337             buf = lines[0]
       
   338            
       
   339             # yield the rest a line at a time in reverse order... this looks weird, but that's how slicing works :)
       
   340             # XXX: use something like islice, this has to build a slice object
       
   341             for line in lines[:0:-1] :
       
   342                 yield self.decoder.decode(line)
       
   343 
       
   344     def read_latest (self, count) :
       
   345         """
       
   346             Returns up to count events, from the end of the file, or less, if the file doesn't contain that many lines.
       
   347         """
       
   348 
       
   349         # the list of lines
       
   350         lines = []
       
   351 
       
   352         # start reading lines into lines
       
   353         for line in self._read_lines_reverse() :
       
   354             # append
       
   355             lines.append(line)
       
   356 
       
   357             # done?
       
   358             if len(lines) >= count :
       
   359                 break
       
   360         
       
   361         # decode in reverse order, using our starting date....
       
   362         # XXX: use lines[::-1] or reversed?
       
   363         # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that
       
   364         return self.parser.parse_lines(self.channel, reversed(lines), self.start_date)
       
   365 
       
   366 class LogDirectory (LogSource) :
       
   367     """
       
   368         A directory containing a series of timestamped LogFiles
       
   369     """
       
   370 
       
   371     def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) :
       
   372         """
       
   373             Load the logfiles at the given path, which are for the given LogChannel
       
   374             
       
   375             Decode the file lines using the given decoder, the files are named according the the date in the given
       
   376             timezone and date format, and will be parsed using the given parser.
       
   377         """
       
   378 
       
   379         # store
       
   380         self.channel = channel
       
   381         self.path = path
       
   382         self.tz = tz
       
   383         self.parser = parser
       
   384         self.decoder = decoder
       
   385         self.filename_fmt = filename_fmt
       
   386 
       
   387     def _get_logfile_date (self, d, load=True, mtime=False, ignore_missing=False) :
       
   388         """
       
   389             Get the logfile corresponding to the given naive date in our timezone. 
       
   390             
       
   391             If load is False, only test for the presence of the logfile, do not actually open it. If mtime is given,
       
   392             then this returns the file's mtime
       
   393 
       
   394             Returns None if the logfile does not exist, unless ignore_missing is given as False.
       
   395         """
       
   396 
       
   397         # format filename
       
   398         filename = d.strftime(self.filename_fmt)
       
   399 
       
   400         # build path
       
   401         path = os.path.join(self.path, filename)
       
   402         
       
   403         try :
       
   404             if load :
       
   405                 # open+return the LogFile
       
   406                 return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel)
       
   407             
       
   408             elif mtime :
       
   409                 # stat
       
   410                 return utils.mtime(path)
       
   411 
       
   412             else :
       
   413                 # test
       
   414                 return os.path.exists(path)
       
   415 
       
   416         # XXX: move to LogFile
       
   417         except IOError, e :
       
   418             # return None for missing files
       
   419             if e.errno == errno.ENOENT and ignore_missing :
       
   420                 return None
       
   421 
       
   422             else :
       
   423                 raise
       
   424     
       
   425     def _iter_logfile_dates (self, after=None, until=None, reverse=False) :
       
   426         """
       
   427             Yields a series of naive datetime objects representing the logfiles that are available, in time order.
       
   428             
       
   429             Parameters :
       
   430                 after   only dates from said date onwards will be returned
       
   431                 until   only dates up to and including said date will be returned
       
   432                 reverse the dates are returned in reverse order instead. Note that the meaning of after/until doesn't change
       
   433         """
       
   434 
       
   435         # convert timestamps to our timezone's dates
       
   436         if after :
       
   437             after = after.astimezone(self.tz).date()
       
   438 
       
   439         if until :
       
   440             until = until.astimezone(self.tz).date()
       
   441 
       
   442         # listdir
       
   443         filenames = os.listdir(self.path)
       
   444 
       
   445         # sort
       
   446         filenames.sort(reverse=reverse)
       
   447 
       
   448         # iter files
       
   449         for filename in filenames :
       
   450             try :
       
   451                 # parse date
       
   452                 dt = self.tz.localize(datetime.datetime.strptime(filename, self.filename_fmt))
       
   453                 date = dt.date()
       
   454             
       
   455             except :
       
   456                 # ignore
       
   457                 continue
       
   458 
       
   459             else :
       
   460                 if (after and date < after) or (until and date > until) :
       
   461                     # ignore
       
   462                     continue
       
   463                 
       
   464                 else :
       
   465                     # yield
       
   466                     yield dt
       
   467             
       
   468     def _iter_date_reverse (self, dt=None) :
       
   469         """
       
   470             Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
       
   471             given *datetime*, or the the current date, if none given
       
   472         """
       
   473         
       
   474         # default to now
       
   475         if not dt :
       
   476             dtz = self.tz.localize(datetime.datetime.now())
       
   477 
       
   478         else :
       
   479             # convert to target timezone
       
   480             dtz = dt.astimezone(self.tz)
       
   481 
       
   482         # iterate unto infinity
       
   483         while True :
       
   484             # yield
       
   485             yield dtz.date()
       
   486             
       
   487             # one day sdrawkcab
       
   488             dtz -= ONE_DAY
       
   489     
       
   490     def _iter_logfile_reverse (self, dt=None, max_files=100) :
       
   491         """
       
   492             Yields a series of LogFile objects, iterating backwards in time starting at the given datetime, or the
       
   493             current date, if none given.
       
   494 
       
   495             Reads/probes at most max_files files.
       
   496         """
       
   497         
       
   498         # start counting at zero...
       
   499         file_count = 0
       
   500 
       
   501         # have we found any files at all so far?
       
   502         have_found = False
       
   503 
       
   504         # iterate backwards over days
       
   505         for day in self._iter_date_reverse(dt) :
       
   506             # stop if we've handled enough files by now
       
   507             if file_count > max_files :
       
   508                 break
       
   509             
       
   510             # try and open the next logfile
       
   511             logfile = None
       
   512             
       
   513             file_count += 1
       
   514             logfile = self._get_logfile_date(day, ignore_missing=True)
       
   515             
       
   516             # no logfile there?
       
   517             if not logfile :
       
   518                 # hit our limit?
       
   519                 if file_count > max_files :
       
   520                     # if we didn't find any logfiles at all, terminate rudely
       
   521                     if not have_found :
       
   522                         raise Exception("No recent logfiles found")
       
   523                     
       
   524                     else :
       
   525                         # stop looking, deal with what we've got
       
   526                         return
       
   527 
       
   528                 else :
       
   529                     # skip to next day
       
   530                     continue
       
   531             
       
   532             # mark have_found
       
   533             have_found = True
       
   534 
       
   535             # yield it
       
   536             yield logfile
       
   537 
       
   538     def get_latest (self, count) :
       
   539         """
       
   540             Uses _logfile_reverse to read the yield the given lines from as many logfiles as needed
       
   541         """
       
   542 
       
   543         # read the events into here
       
   544         lines = []
       
   545         
       
   546         # start reading in those logfiles
       
   547         for logfile in self._iter_logfile_reverse() :
       
   548             # read the events
       
   549             # XXX: use a queue
       
   550             lines = list(logfile.read_latest(count)) + lines
       
   551 
       
   552             # done?
       
   553             if len(lines) >= count :
       
   554                 break
       
   555         
       
   556         # return the events
       
   557         return lines
       
   558 
       
   559     def get_date (self, dt) :
       
   560         """
       
   561             A 'day' is considered to be a 24-hour period from 00:00:00 23:59:59. If the timezone of the given datetime
       
   562             differs from our native datetime, this may involve lines from more than one logfile.
       
   563         """
       
   564 
       
   565         # begin/end of 24h period, in target timezone
       
   566         dtz_begin = dt.replace(hour=0, minute=0, second=0).astimezone(self.tz)
       
   567         dtz_end = dt.replace(hour=23, minute=59, second=59, microsecond=999999).astimezone(self.tz)
       
   568 
       
   569         # as dates
       
   570         d_begin = dtz_begin.date() 
       
   571         d_end = dtz_end.date()
       
   572         
       
   573 #        print
       
   574 #        print "LogDirectory.get_date - %s" % dt
       
   575 #        print "\t   %s %s" % (d_begin, dtz_begin)
       
   576 #        print "\t-> %s %s" % (d_end, dtz_end)
       
   577 
       
   578         # if they're the same, just pull the full log for that date
       
   579         if d_begin == d_end :
       
   580             # open that log
       
   581             logfile = self._get_logfile_date(d_begin)
       
   582             
       
   583             # return the full data
       
   584             return logfile.read_full()
       
   585         
       
   586         # otherwise, we need to pull two partial logs
       
   587         else :
       
   588             # open both of them, but it's okay if we don't have the second one
       
   589             f_begin = self._get_logfile_date(d_begin)
       
   590             f_end = self._get_logfile_date(d_end, ignore_missing=True)
       
   591 
       
   592             # chain together the two sources
       
   593             return itertools.chain(
       
   594                 f_begin.read_from(dtz_begin), 
       
   595                 f_end.read_until(dtz_end) if f_end else []
       
   596             )
       
   597     
       
   598     def _iter_month_days (self, month) :
       
   599         """
       
   600             Iterates over the days of a month as dt objects with time=0
       
   601         """
       
   602         
       
   603         # there's at most 31 days in a month...
       
   604         for day in xrange(1, 32) :
       
   605             try :
       
   606                 # try and build the datetime
       
   607                 dt = datetime.datetime(month.year, month.month, day)
       
   608 
       
   609             except :
       
   610                 # stop
       
   611                 return
       
   612             
       
   613             else :
       
   614                 # fix timezones + yield
       
   615                 yield month.tzinfo.localize(dt)
       
   616 
       
   617     def get_month_days (self, month) :
       
   618         """
       
   619             Returns a set of dates for which logfiles are available in the given datetime's month
       
   620         """
       
   621 
       
   622         # iterate over month's days
       
   623         for dt in self._iter_month_days(month) :
       
   624             # date in our target timezone
       
   625             log_date = dt.astimezone(self.tz).date()
       
   626             
       
   627             # test for it
       
   628             if self._get_logfile_date(log_date, load=False, ignore_missing=True) :
       
   629                 # valid
       
   630                 yield dt.date()
       
   631 
       
   632     def get_modified (self, dt=None, after=None, until=None) :
       
   633         """
       
   634             Returns the contents off all logfiles with mtimes past the given date
       
   635         """
       
   636 
       
   637         # iterate through all available logfiles in date order, as datetimes, from the given date on
       
   638         for log_date in self._iter_logfile_dates(after, until) :
       
   639             # compare against dt?
       
   640             if dt :
       
   641                 # stat
       
   642                 mtime = self._get_logfile_date(log_date, load=False, mtime=True, ignore_missing=True)
       
   643                 
       
   644                 # not modified?
       
   645                 if mtime < dt :
       
   646                     # skip
       
   647                     continue
       
   648                 
       
   649             # open
       
   650             logfile = self._get_logfile_date(log_date)
       
   651 
       
   652             # yield all lines
       
   653             for line in logfile.read_full() :
       
   654                 yield line
       
   655 
       
   656     def get_prev_date (self, dt) :
       
   657         """
       
   658             Just use _iter_logfile_dates
       
   659         """
       
   660         
       
   661         # use for to "iter" once
       
   662         for log_date in self._iter_logfile_dates(until=dt - ONE_DAY, reverse=True) :
       
   663             return log_date
       
   664         
       
   665         else :
       
   666             return None
       
   667 
       
   668     def get_next_date (self, dt) :
       
   669         """
       
   670             Just use _iter_logfile_dates
       
   671         """
       
   672         
       
   673         # use for to "iter" once
       
   674         for log_date in self._iter_logfile_dates(after=dt + ONE_DAY) :
       
   675             return log_date
       
   676         
       
   677         else :
       
   678             return None
       
   679