author | Tero Marttila <terom@fixme.fi> |
Sun, 08 Feb 2009 02:55:53 +0200 | |
branch | sites |
changeset 43 | fc11c4e86a82 |
parent 41 | 9585441a4bfb |
permissions | -rw-r--r-- |
41 | 1 |
""" |
2 |
A source of IRC log files |
|
3 |
""" |
|
4 |
||
5 |
import codecs |
|
6 |
from datetime import date, datetime, timedelta |
|
7 |
import pytz |
|
8 |
||
9 |
# for SEEK_*, errno |
|
10 |
import os, errno |
|
11 |
||
12 |
class LogSource (object) :
    """
    A collection of IRC logs for a specific target in some format. Provides
    the possibility to read specific events.
    """

    def get_latest (self, count) :
        """
        Yield the latest events, up to `count` of them.

        Subclasses must override this method.
        """

        # the original used the bare name `abstract`, which only raised a
        # NameError when called; NotImplementedError states the intent
        raise NotImplementedError()
class LogFile (LogSource) :
    """
    A file containing LogEvents, one per line.
    """

    def __init__ (self, path, charset='utf-8', sep='\n') :
        """
        Open the file at the given path, which contains data in the given
        charset, as lines separated by the given separator.
        """

        # store
        self.path = path
        self.charset = charset
        self.sep = sep

        # open
        self.file = codecs.open(path, 'r', charset)

    def __iter__ (self) :
        """
        Yields a series of lines, as read from the top of the file.
        """

        # seek to beginning
        self.file.seek(0)

        # iterate over lines
        return iter(self.file)

    def get_latest (self, count) :
        """
        Returns up to <count> lines from the end of the file, or less, if the
        file doesn't contain that many lines.

        Lines are returned in file order (oldest first), without trailing
        separators.

        NOTE(review): seek/tell use byte offsets while read() counts decoded
        characters, so a block boundary can fall inside a multi-byte
        character for non-ASCII charsets -- same limitation as the original
        implementation; confirm the logs are effectively single-byte.
        """

        # the list of complete lines collected so far
        lines = []

        # nothing to do for a non-positive count
        if count <= 0 :
            return lines

        # read backwards in chunks of this many chars
        BLOCKSIZE = 1024

        # find the end of the file
        self.file.seek(0, os.SEEK_END)
        offset = self.file.tell()

        # partial line data left over from the most recently read block
        buf = ''

        # true only while handling the block at the very end of the file
        at_end = True

        # read a block at a time, backwards
        while count > 0 and offset > 0 :
            # never seek before the beginning of the file; this also makes
            # the final (short) read well-defined, where the original's
            # `assert len(read_buf) == BLOCKSIZE` would fire
            read_size = min(BLOCKSIZE, offset)
            offset -= read_size

            # read the block and prepend it to the leftover data
            self.file.seek(offset)
            buf = self.file.read(read_size) + buf

            # split out lines
            buf_lines = buf.split(self.sep)

            if offset > 0 :
                # the first item may be the tail of a line whose beginning
                # lies in a yet-unread block; keep it as our buffer
                buf = buf_lines.pop(0)

            else :
                # we reached the start of the file, so every item is a
                # complete line (the original dropped the first line here)
                buf = ''

            if at_end :
                at_end = False

                # a trailing separator yields a bogus empty last item; the
                # original worked around this with a fragile `tell() - 2`
                # that also truncated the last line's final character
                if buf_lines and buf_lines[-1] == '' :
                    buf_lines.pop()

            # prepend up to count of the newly found lines to our result
            take = min(count, len(buf_lines))
            if take > 0 :
                lines = buf_lines[-take:] + lines

            # update count
            count -= take

        # return the line list
        return lines
class LogDirectory (LogSource) :
    """
    A directory containing a series of timestamped LogFiles.
    """

    def __init__ (self, path, tz, charset='utf-8', filename_fmt='%Y-%m-%d') :
        """
        Load the logfiles at the given path.

        The files contain data in the given charset, and are named according
        to the date in the given timezone and date format.
        """

        # store
        self.path = path
        self.tz = tz
        self.charset = charset
        self.filename_fmt = filename_fmt

    def _get_logfile_datetime (self, dt) :
        """
        Get the logfile corresponding to the given datetime.
        """

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # convert to date and use that
        return self._get_logfile_date(dtz.date())

    def _get_logfile_date (self, d) :
        """
        Get the logfile corresponding to the given naive date in our
        timezone.

        Raises IOError (ENOENT) if no logfile exists for that date, since
        LogFile opens the file immediately.
        """

        # format filename
        filename = d.strftime(self.filename_fmt)

        # build path
        path = os.path.join(self.path, filename)

        # return the LogFile
        return LogFile(path, self.charset)

    def _iter_backwards (self, dt=None) :
        """
        Yields an infinite series of naive date objects in our timezone,
        iterating backwards in time starting at the given *datetime*, or the
        current date, if none given.
        """

        # default to now (identity test; the original's `if not dt` treated
        # any falsy value the same way)
        if dt is None :
            dt = datetime.now(pytz.utc)

        # convert to target timezone
        dtz = dt.astimezone(self.tz)

        # our timedelta
        ONE_DAY = timedelta(1)

        # iterate unto infinity
        while True :
            # yield the current date
            yield dtz.date()

            # step one day backwards
            # NOTE(review): plain datetime arithmetic ignores DST
            # transitions (pytz would need normalize()); harmless here since
            # only the date component is used
            dtz -= ONE_DAY

    def get_latest (self, count) :
        """
        Uses _iter_backwards + _get_logfile_date to yield the given number of
        lines from as many logfiles as needed.

        NOTE(review): lines come newest-file-first, each file's lines in
        file order -- confirm callers expect that interleaving.
        """

        # iterate backwards from now
        day_iter = self._iter_backwards()

        # number of files examined
        files = 0

        # give up after this many missing files
        MAX_FILES = 100

        # loop until we have yielded enough lines
        while count > 0 :
            try :
                # count this attempt
                files += 1

                # open the next day's logfile (builtin next(); the original
                # used the Python-2-only day_iter.next())
                logfile = self._get_logfile_date(next(day_iter))

            except IOError as e :
                # propagate anything other than a missing file
                if e.errno != errno.ENOENT :
                    raise

                # bail out rather than scanning backwards forever
                if files > MAX_FILES :
                    raise Exception("No recent logfiles found")

                else :
                    # skip to the previous day
                    continue

            # yield lines from this file
            for line in logfile.get_latest(count) :
                # yield while we still need to, otherwise, stop
                if count > 0 :
                    # one more line accounted for
                    count -= 1

                    yield line

                else :
                    break