1 """ |
|
2 A source of IRC log files |
|
3 """ |
|
4 |
|
5 import codecs |
|
6 from datetime import date, datetime, timedelta |
|
7 import pytz |
|
8 |
|
9 # for SEEK_*, errno |
|
10 import os, errno |
|
11 |
|
class LogSource (object) :
    """
    A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events
    """

    def get_latest (self, count) :
        """
        Yield the latest events, up to `count` of them.

        Abstract: subclasses must override this. The original body was the
        bare name `abstract`, which only raised an incidental NameError;
        NotImplementedError states the intent explicitly.
        """

        raise NotImplementedError()
|
23 |
|
class LogFile (LogSource) :
    """
    A file containing LogEvents, one per separator-delimited line
    """

    def __init__ (self, path, charset='utf-8', sep='\n') :
        """
        Open the file at the given path, which contains data of the given codec, as lines separated by the given separator

        :param path: filesystem path of the logfile
        :param charset: codec used to decode the file's bytes
        :param sep: separator delimiting individual lines/events
        """

        # store parameters
        self.path = path
        self.charset = charset
        self.sep = sep

        # open for decoded reads; kept open for the lifetime of this object
        self.file = codecs.open(path, 'r', charset)

    def __iter__ (self) :
        """
        Yields a series of lines, as read from the top of the file
        """

        # rewind to the beginning
        self.file.seek(0)

        # then just iterate over the file's lines
        return iter(self.file)

    def get_latest (self, count) :
        """
        Returns up to `count` lines from the end of the file, or fewer, if the
        file doesn't contain that many lines.

        The file is scanned backwards one block at a time, so only the tail of
        the file is ever read, regardless of its total size.
        """

        # nothing requested -> nothing to scan
        if count <= 0 :
            return []

        # read this much per backwards step
        BLOCKSIZE = 1024

        # find the file size
        self.file.seek(0, os.SEEK_END)
        offset = self.file.tell()

        # complete lines collected so far, in file order
        lines = []

        # partial line-leading data carried between blocks
        buf = ''

        # True until the last-in-file block has been read
        first_block = True

        # read blocks backwards until we have enough lines or hit the start
        while offset > 0 and len(lines) < count :
            # clamp the block size so we never seek before the start of file;
            # the old code's fixed-size reads both asserted out on small files
            # and looped forever re-reading block zero
            read_size = min(BLOCKSIZE, offset)
            offset -= read_size

            # read the block and prepend it to the pending data
            self.file.seek(offset)
            buf = self.file.read(read_size) + buf

            # a file that ends with the separator would otherwise produce a
            # bogus empty final line; drop exactly one trailing separator
            # (replaces the old unexplained `tell() - 2`, which chopped a
            # character off the last line)
            if first_block :
                first_block = False

                if buf.endswith(self.sep) :
                    buf = buf[:-len(self.sep)]

            # split into lines; the first piece may still be incomplete, so it
            # stays in the buffer to be completed by the next (earlier) block
            buf_lines = buf.split(self.sep)
            buf = buf_lines[0]

            # everything after the first piece is a complete line
            lines = buf_lines[1:] + lines

        # reached the top of the file -> the leftover buffer is the first line
        if offset == 0 and buf and len(lines) < count :
            lines = [buf] + lines

        # at most count lines, from the end
        return lines[-count:]
|
113 |
|
class LogDirectory (LogSource) :
    """
    A directory containing a series of timestamped LogFiles
    """

    def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
        """
        Load the logfiles at the given path.

        The files contain data in the given charset, and are named according to the date in the given timezone and
        date format.

        :param path: directory holding the logfiles
        :param tz: tzinfo whose local dates name the files
        :param charset: codec used to decode each file's bytes
        :param filename_fmt: strftime format mapping a date to a filename
        """

        # store parameters
        self.path = path
        self.tz = tz
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime (self, dt) :
        """
        Get the logfile corresponding to the given tz-aware datetime
        """

        # convert to our target timezone
        dtz = dt.astimezone(self.tz)

        # the local date picks the file
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date (self, d) :
        """
        Get the logfile corresponding to the given naive date in our timezone.

        Raises IOError (ENOENT) if no logfile exists for that date.
        """

        # format the filename from the date
        filename = d.strftime(self.filename_fmt)

        # build the full path
        path = os.path.join(self.path, filename)

        # open it as a LogFile
        return LogFile(path, self.charset)

    def _iter_backwards (self, dt=None) :
        """
        Yields an infinite series of naive date objects in our timezone, iterating backwards in time starting at the
        given *datetime*, or the current date, if none given
        """

        # default to the current instant, in UTC
        if not dt :
            dt = datetime.now(pytz.utc)

        # convert to our target timezone
        dtz = dt.astimezone(self.tz)

        # step size
        ONE_DAY = timedelta(1)

        # iterate unto infinity
        while True :
            # the current day
            yield dtz.date()

            # step one day backwards
            dtz -= ONE_DAY

    def get_latest (self, count) :
        """
        Uses _iter_backwards + _get_logfile_date to yield up to `count` lines,
        reading from as many of the most recent logfiles as needed (most
        recent file's lines first).

        Raises an Exception if no logfile at all is found within the most
        recent MAX_FILES days.
        """

        # iterate backwards from today
        day_iter = self._iter_backwards()

        # number of files tried so far
        files = 0

        # give up after this many consecutive missing days with no logs found
        MAX_FILES = 100

        # loop until we've yielded enough lines
        while count > 0 :
            logfile = None

            try :
                # count this attempt
                files += 1

                # open the next (earlier) day's logfile; next() builtin
                # instead of the py2-only .next() method
                logfile = self._get_logfile_date(next(day_iter))

            # `as` form is valid on py2.6+ and py3, unlike `except IOError, e`
            except IOError as e :
                # only a missing file is skippable; anything else propagates
                if e.errno != errno.ENOENT :
                    raise

                if files > MAX_FILES :
                    raise Exception("No recent logfiles found")

                else :
                    # skip to the next (earlier) day
                    continue

            # yield lines from this file
            for line in logfile.get_latest(count) :
                # yield while we still need to, otherwise, stop
                if count > 0 :
                    # one down
                    count -= 1

                    yield line

                else :
                    break
|
230 |
|
231 |
|