|
1 """ |
|
2 A source of IRC log files |
|
3 """ |
|
4 |
|
5 import codecs |
|
6 from datetime import date, datetime, timedelta |
|
7 import pytz |
|
8 |
|
9 # for SEEK_*, errno |
|
10 import os, errno |
|
11 |
|
class LogSource(object):
    """
    A collection of IRC logs for a specific target in some format.

    Provides the possibility to read specific events.
    """

    def get_latest(self, count):
        """
        Yield the latest events, up to `count` of them.

        Must be implemented by subclasses.
        """
        # the original used a bare `abstract` name, which only raised an
        # uninformative NameError when called; raise the idiomatic
        # exception for an unimplemented interface method instead
        raise NotImplementedError()
|
class LogFile(LogSource):
    """
    A file containing LogEvents, stored as separator-delimited lines.
    """

    def __init__(self, path, charset='utf-8', sep='\n'):
        """
        Open the file at the given path.

        :param path: filesystem path of the log file
        :param charset: codec used to decode the file's contents
        :param sep: separator between lines/events
        :raises IOError: if the file cannot be opened (e.g. ENOENT)
        """
        # store parameters
        self.path = path
        self.charset = charset
        self.sep = sep

        # open for reading with transparent decoding
        self.file = codecs.open(path, 'r', charset)

    def __iter__(self):
        """
        Yield a series of lines, as read from the top of the file.
        """
        # rewind to the beginning
        self.file.seek(0)

        # the decoded stream iterates over lines
        return iter(self.file)

    def get_latest(self, count):
        """
        Return up to `count` lines from the end of the file, or fewer if the
        file doesn't contain that many lines.

        Reads the file backwards one block at a time, so a large file is not
        read in full.

        NOTE(review): seeking to computed offsets in a codecs-decoded stream
        assumes block boundaries fall on valid character boundaries; safe for
        single-byte charsets and mostly-ASCII UTF-8, but other multibyte
        charsets may fail to decode at a block boundary — confirm against the
        charsets actually in use.
        """
        # read this many characters per backwards step
        BLOCKSIZE = 1024

        # degenerate request
        if count <= 0:
            return []

        # find the end of the file
        self.file.seek(0, os.SEEK_END)
        offset = self.file.tell()

        # complete lines collected so far, oldest-first
        lines = []

        # leftover (possibly incomplete) first line of the data read so far
        buf = ''

        # step backwards a block at a time until we have enough lines or
        # reach the start of the file
        while len(lines) < count and offset > 0:
            # never read past the beginning of the file; the final block
            # near the start may be shorter than BLOCKSIZE
            read_size = min(BLOCKSIZE, offset)
            offset -= read_size

            self.file.seek(offset)
            read_buf = self.file.read(read_size)

            # prepend the new block to the carried-over partial line
            buf = read_buf + buf

            # split into lines; the first element may still be incomplete
            # (its beginning could lie in an earlier block), so keep it as
            # the carry-over buffer
            buf_lines = buf.split(self.sep)
            buf = buf_lines[0]
            complete = buf_lines[1:]

            # a trailing separator at the very end of the file produces one
            # empty final element — that's not a line, drop it
            if not lines and complete and complete[-1] == '':
                complete = complete[:-1]

            # these lines precede everything collected so far
            lines = complete + lines

        # if we consumed the whole file, the leftover buffer is its first line
        if offset == 0 and buf and len(lines) < count:
            lines = [buf] + lines

        # return only the latest `count` lines
        return lines[-count:]
|
109 |
|
class LogDirectory(LogSource):
    """
    A directory containing a series of timestamped LogFiles.
    """

    def __init__(self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d'):
        """
        Load the logfiles at the given path.

        :param path: directory containing the logfiles
        :param tz: tzinfo; files are named after the date in this timezone
        :param charset: codec the files' contents are decoded with
        :param filename_fmt: strftime() format mapping a date to a filename
        """
        # store parameters
        self.path = path
        self.tz = tz
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime(self, dt):
        """
        Get the logfile corresponding to the given (timezone-aware) datetime.
        """
        # interpret the timestamp in our target timezone
        dtz = dt.astimezone(self.tz)

        # then look it up by date
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date(self, d):
        """
        Get the logfile corresponding to the given naive date in our timezone.

        :raises IOError: with ENOENT if no logfile exists for that date
        """
        # format the filename from the date
        filename = d.strftime(self.filename_fmt)

        # build the full path
        path = os.path.join(self.path, filename)

        # return the LogFile (opening it may raise IOError)
        return LogFile(path, self.charset)

    def _iter_backwards(self, dt=None):
        """
        Yield an infinite series of naive date objects in our timezone,
        iterating backwards in time starting at the given *datetime*, or
        the current date, if none given.
        """
        # default to the current instant in UTC
        if dt is None:
            dt = datetime.now(pytz.utc)

        # work in our target timezone
        dtz = dt.astimezone(self.tz)

        ONE_DAY = timedelta(days=1)

        # iterate unto infinity
        while True:
            yield dtz.date()

            # one day backwards
            dtz -= ONE_DAY

    def get_latest(self, count):
        """
        Yield up to `count` lines, reading from as many of the most recent
        logfiles as needed (newest file's lines first, then older files').

        :raises Exception: if too many consecutive days have no logfile
        """
        # iterate backwards from today
        day_iter = self._iter_backwards()

        # days skipped because no logfile existed for them
        missing = 0

        # give up after this many missing days, so an empty directory does
        # not make us scan backwards forever
        MAX_FILES = 100

        # loop until we've yielded enough lines
        while count > 0:
            try:
                # open the logfile for the next (older) day
                # (next() / `as e` forms work on Python 2.6+ and 3.x,
                # unlike the original .next() / `except IOError, e`)
                logfile = self._get_logfile_date(next(day_iter))

            except IOError as e:
                # propagate anything other than "no such file"
                if e.errno != errno.ENOENT:
                    raise

                missing += 1

                # only missing days count toward the limit; the original
                # counted successful opens too, so a long (>100 day) history
                # could spuriously fail despite logs being found
                if missing > MAX_FILES:
                    raise Exception("No recent logfiles found")

                # skip to the next day
                continue

            # yield lines from this file while we still need them
            for line in logfile.get_latest(count):
                if count <= 0:
                    break

                count -= 1
                yield line