# HG changeset patch # User Tero Marttila # Date 1234299611 -7200 # Node ID 645cf9c4441ed65a1b2ef723e9544cf411178cba # Parent 0521cf830eb9824e02099ce4fbd4c1cea1cbfa79 implement full parser+formatter for irssi diff -r 0521cf830eb9 -r 645cf9c4441e config.py --- a/config.py Tue Feb 10 22:59:52 2009 +0200 +++ b/config.py Tue Feb 10 23:00:11 2009 +0200 @@ -6,7 +6,7 @@ from log_parser import IrssiParser from log_channel import LogChannel from log_source import LogSourceDecoder, LogDirectory -from log_formatter import IrssiFormatter +from log_formatter import IrssiFormatter, DebugFormatter from channels import ChannelList import log_formatter @@ -73,6 +73,7 @@ # available formatters LOG_FORMATTERS = { 'irssi': IrssiFormatter, + 'debug': DebugFormatter, } # default preferences diff -r 0521cf830eb9 -r 645cf9c4441e log_channel.py --- a/log_channel.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_channel.py Tue Feb 10 23:00:11 2009 +0200 @@ -14,10 +14,14 @@ Initialize this channel from the given identifier key, network name, channel name, and LogSource """ + # store self.id = id self.network = network self.name = name self.source = source + + # bind source + self.source.bind_channel(self) @property def title (self) : diff -r 0521cf830eb9 -r 645cf9c4441e log_formatter.py --- a/log_formatter.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_formatter.py Tue Feb 10 23:00:11 2009 +0200 @@ -53,15 +53,22 @@ else : timestamp_fmt = self.timestamp_fmt - - # build timestamp - timestamp = dtz.strftime(timestamp_fmt) + + # breakdown source + source_nickname, source_username, source_hostname, source_chanflag = line.source + target_nickname = line.target # format with dict return template % dict( - timestamp = timestamp, - source = line.source, - data = line.data, + channel_name = line.channel.name, + datetime = dtz.strftime('%a %b %d %H:%M:%S %Y'), + timestamp = dtz.strftime(timestamp_fmt), + source_nickname = source_nickname, + source_username = source_username, + source_hostname = source_hostname, + source_chanflag = source_chanflag, + target_nickname = target_nickname, + message = line.data, ) def format_txt (self, lines, full_timestamps=False) : @@ -98,11 +105,15 @@ abstract -class BaseHTMLFormatter (object) : +class BaseHTMLFormatter (LogFormatter) : """ Implements some HTML-formatting utils """ + + # parameters + html_fixedwidth = True + # regexp to match URLs URL_REGEXP = re.compile(r"http://\S+") def _process_links (self, line) : @@ -120,35 +131,7 @@ return '%(url_html)s' % dict(url_link=url_link, url_html=url_html) return self.URL_REGEXP.sub(_encode_url, line) - -class IrssiTextFormatter (RSSFormatter, PILImageFormatter, LogFormatter) : - """ - Implements format_txt for irssi-style output - """ - - # format definitions by type - __FMT = { - LogTypes.RAW : "%(timestamp)s %(data)s", - } - - def format_txt (self, lines, full_timestamps=False) : - # ...handle each line - for line in lines : - # using __TYPES - yield line, self._format_line_text(line, self.__FMT, full_timestamps) - -class IrssiFormatter (IrssiTextFormatter, BaseHTMLFormatter) : - """ - Implements plain black-and-white irssi-style formatting - """ - - # name - name = 'irssi' - title = "Irssi (plain)" - - # parameters - html_fixedwidth = True - + def format_html (self, lines, **kwargs) : """ Just uses format_txt, but processes links, etc @@ -165,6 +148,66 @@ # yield yield line, html + +class IrssiTextFormatter (RSSFormatter, PILImageFormatter, LogFormatter) : + """ + Implements format_txt for irssi-style output + """ + + # format definitions by type + __FMT = { + LogTypes.RAW : "%(timestamp)s %(data)s", + LogTypes.LOG_OPEN : "--- Log opened %(datetime)s", + LogTypes.LOG_CLOSE : "--- Log closed %(datetime)s", + + LogTypes.MSG : "%(timestamp)s <%(source_chanflag)s%(source_nickname)s> %(message)s", + LogTypes.NOTICE : "%(timestamp)s -%(source_nickname)s- %(message)s", + LogTypes.ACTION : "%(timestamp)s * %(source_nickname)s %(message)s", + + LogTypes.JOIN : "%(timestamp)s -!- %(source_nickname)s [%(source_username)s@%(source_hostname)s] has joined %(channel_name)s", + LogTypes.PART : "%(timestamp)s -!- %(source_nickname)s [%(source_username)s@%(source_hostname)s] has left %(channel_name)s [%(message)s]", + LogTypes.KICK : "%(timestamp)s -!- %(target_nickname)s was kicked from %(channel_name)s by %(source_nickname)s [%(message)s]", + LogTypes.MODE : "%(timestamp)s -!- mode/%(channel_name)s [%(message)s] by %(source_nickname)s", + + LogTypes.NICK : "%(timestamp)s -!- %(source_nickname)s is now known as %(target_nickname)s", + LogTypes.QUIT : "%(timestamp)s -!- %(source_nickname)s [%(source_username)s@%(source_hostname)s] has quit [%(message)s]", + + LogTypes.TOPIC : "%(timestamp)s -!- %(source_nickname)s changed the topic of %(channel_name)s to: %(message)s", + + LogTypes.SELF_NOTICE: "%(timestamp)s -%(source_nickname)s- %(message)s", + LogTypes.SELF_NICK : "%(timestamp)s -!- %(source_nickname)s is now known as %(target_nickname)s", + } + + def format_txt (self, lines, full_timestamps=False) : + # ...handle each line + for line in lines : + # using __TYPES + yield line, self._format_line_text(line, self.__FMT, full_timestamps) + +class IrssiFormatter (BaseHTMLFormatter, IrssiTextFormatter) : + """ + Implements plain black-and-white irssi-style formatting + """ + + # name + name = 'irssi' + title = "Irssi (plain)" + +class DebugFormatter (BaseHTMLFormatter) : + """ + Implements a raw debug-style formatting of LogLines + """ + + # name + name = 'debug' + title = "Raw debugging format" + + def format_txt (self, lines, full_timestamps=False) : + # iterate + for line in lines : + # just dump + yield line, repr(line) + def by_name (name) : """ Lookup and return a class LogFormatter by name diff -r 0521cf830eb9 -r 645cf9c4441e log_line.py --- a/log_line.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_line.py Tue Feb 10 23:00:11 2009 +0200 @@ -4,17 +4,101 @@ class LogTypes : """ - Definitions of the various LogLines types + Definitions of the various LogLines types: + + LogTypes.RAW + LogTypes.LOG_OPEN + LogTypes.LOG_CLOSE + + LogTypes.MSG + LogTypes.NOTICE + LogTypes.ACTION + + LogTypes.JOIN + LogTypes.PART + LogTypes.KICK + LogTypes.MODE + + LogTypes.NICK + LogTypes.QUIT + + LogTypes.TOPIC + + LogTypes.SELF_NOTICE + LogTypes.SELF_NICK """ + + # list of LogType values by name + LIST = [ + ## special + # unknown type, may or may not have a timestamp, no source, only data + ('RAW', 0x01), + + # log opened + ('LOG_OPEN', 0x02), + + # log closed + ('LOG_CLOSE', 0x03), + + ## messages + # normal message + ('MSG', 0x10), + + # notice + ('NOTICE', 0x11), + + # CTCP action + ('ACTION', 0x12), + + ## user-channel stats + # join + ('JOIN', 0x21), + + # part + ('PART', 0x22), + + # kick + ('KICK', 0x25), + + # channel modes + ('MODE', 0x26), + + ## user status + # nick-change + ('NICK', 0x31), + + # quit + ('QUIT', 0x32), + + ## general channel status + # topic changed + ('TOPIC', 0x41), + + ## our own actions + ('SELF_NOTICE', 0x51), + ('SELF_NICK', 0x52), + ] - # unknown type, may or may not have a timestamp, no source, only data - RAW = 0x01 + @classmethod + def name_from_code (cls, code) : + """ + Looks up a LogType name by code + """ + + return dict((type, name) for name, type in cls.LIST)[code] + +# apply as attributes +for name, code in LogTypes.LIST : + setattr(LogTypes, name, code) class LogLine (object) : """ An event on some specific channel """ + # the LogChannel + channel = None + # the offset, only garunteed to be unique for a specific channel and date offset = None @@ -24,20 +108,30 @@ # the UTC timestamp of the event timestamp = None - # the event source + # the event source, this should be a source = None + # possible event target, for certain types (kick, nick) + target = None + # associated data (message, etc) data = None - def __init__ (self, offset, type, timestamp, source, data) : + def __init__ (self, channel, offset, type, timestamp, source, target, data) : """ Initialize with given values """ + self.channel = channel self.offset = offset self.type = type self.timestamp = timestamp self.source = source + self.target = target self.data = data + + def __repr__ (self) : + return "channel=%s, offset=%s, type=%s, timestamp=%s, source=%s, target=%s, data=%s" % ( + self.channel, self.offset, LogTypes.name_from_code(self.type), self.timestamp, self.source, self.target, self.data + ) diff -r 0521cf830eb9 -r 645cf9c4441e log_parser.py --- a/log_parser.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_parser.py Tue Feb 10 23:00:11 2009 +0200 @@ -2,10 +2,18 @@ Parse log data into log_events """ +import re import datetime -import log_line -from log_line import LogTypes +from log_line import LogTypes, LogLine + +class LogParseError (Exception) : + """ + Parsing some line failed + """ + + def __init__ (self, offset, line, message) : + super(LogParseError, self).__init__("%r@%s: %s" % (line, offset, message)) class LogParser (object) : """ @@ -20,10 +28,12 @@ self.tz = tz self.timestamp_fmt = timestamp_fmt - def parse_lines (self, lines, date=None, starting_offset=None) : + def parse_lines (self, channel, lines, date=None, starting_offset=None) : """ Parse the given (iterable) lines of unicode text into a LogEvent, no trailing newline. + Channel is the LogChannel that these lines belong to. + Offset is the starting offset, and may be None to not use it. Giving date lets the parser build full timestamps, otherwise, unless line timestamps have full date @@ -32,13 +42,114 @@ abstract - class IrssiParser (LogParser) : """ A parser for irssi logfiles """ - def parse_lines (self, lines, date=None, starting_offset=None) : + # subexpression parts + _TS = r'(?P\S+)' + _NICK = r'(?P.+?)' + _NICK2 = r'(?P.+?)' + _CHAN = r'(?P.+?)' + _USERHOST = r'(?P.*?)@(?P.*?)' + _MSG = r'(?P.*)' + + # regular expressions for matching lines, by type + TYPE_EXPRS = ( + ( LogTypes.LOG_OPEN, r'--- Log opened (?P.+)' ), + ( LogTypes.LOG_CLOSE, r'--- Log closed (?P.+)' ), + ( LogTypes.MSG, _TS + r' <(?P.)' + _NICK + '> ' + _MSG ), + ( LogTypes.NOTICE, _TS + r' -' + _NICK + ':' + _CHAN + '- ' + _MSG ), + ( LogTypes.ACTION, _TS + r' \* ' + _NICK + ' ' + _MSG ), + ( LogTypes.JOIN, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has joined ' + _CHAN ), + ( LogTypes.PART, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has left ' + _CHAN + ' \[(?P.*?)\]' ), + ( LogTypes.KICK, _TS + r' -!- ' + _NICK2 + ' was kicked from ' + _CHAN + ' by ' + _NICK + ' \[(?P.*?)\]' ), + ( LogTypes.MODE, _TS + r' -!- mode/' + _CHAN + ' \[(?P.+?)\] by (?P\S+)' ), + ( LogTypes.NICK, _TS + r' -!- ' + _NICK + ' is now known as (?P\S+)' ), + ( LogTypes.QUIT, _TS + r' -!- ' + _NICK + ' \[' + _USERHOST + '\] has quit \[(?P.*?)\]' ), + ( LogTypes.TOPIC, _TS + r' -!- ' + _NICK + ' changed the topic of ' + _CHAN + ' to: (?P.*)' ), + + ( LogTypes.SELF_NOTICE, _TS + r' \[notice\(' + _CHAN + '\)\] ' + _MSG ), + ( LogTypes.SELF_NICK, _TS + r' -!- You\'re now known as (?P\S+)' ), + ) + + # precompile + TYPE_REGEXES = [(type, re.compile(expr)) for type, expr in TYPE_EXPRS] + + def parse_line (self, channel, line, date, offset=None) : + """ + Parse a single line, and return the resulting LogLine, or None, to ignore the line. + + Uses self.TYPE_REGEXES to do the matching + """ + + # empty line + if not line : + return + + # look for match + match = type = None + + # test each type + for type, regex in self.TYPE_REGEXES : + # attempt to match + match = regex.match(line) + + # found, break + if match : + break + + # no match found? + if not match : + raise LogParseError(offset, line, "Line did not match any type") + + # match groups + groups = match.groupdict(None) + + # parse timestamp + if 'datetime' in groups : + # parse datetime using default asctime() format + dt = datetime.datetime.strptime(groups['datetime'], '%a %b %d %H:%M:%S %Y') + + elif 'timestamp' in groups : + # parse timestamp into naive datetime + dt = datetime.datetime.strptime(groups['timestamp'], self.timestamp_fmt) + + # override date? + if date : + dt = dt.replace(year=date.year, month=date.month, day=date.day) + + else : + # no timestamp !? + raise LogParseError(offset, line, "No timestamp") + + # now localize with timezone + dtz = self.tz.localize(dt) + + # source + source = (groups.get('nickname'), groups.get('username'), groups.get('hostname'), groups.get('flags')) + + # target + target = groups.get('nickname2') + + # data + if 'message' in groups : + data = groups['message'] + + elif 'mode' in groups : + data = groups['mode'] + + elif 'topic' in groups : + data = groups['topic'] + + else : + data = None + + # build+return LogLine + return LogLine(channel, offset, type, dtz, source, target, data) + + def parse_lines (self, channel, lines, date=None, starting_offset=None) : """ Parse the given lines, yielding LogEvents. """ @@ -53,45 +164,19 @@ # try and parse try : - line = self.parse_line(line, date, offset) - + line = self.parse_line(channel, line, date, offset) + + # passthrough LogParseError's + except LogParseError : + raise + + # wrap other errors as LogParseError except Exception, e : - raise Exception("Parsing line failed: %r@%d: %s" % (line, offset, e)) + raise LogParseError(line, offset, "Parsing line failed: %s" % e) else : # yield unless None if line : yield line - def parse_line (self, line, date, offset=None) : - """ - Parse a single line, and return the resulting LogLine, or None, to ignore the line - """ - - # empty line - if not line : - return - # status lines - elif line.startswith('---') : - # XXX: handle these - return - - # normal lines - else : - # XXX: only parse timestamps for now - timestamp, data = line.split(' ', 1) - - # parse timestamp into naive datetime - dt = datetime.datetime.strptime(timestamp, self.timestamp_fmt) - - # override date? - if date : - dt = dt.replace(year=date.year, month=date.month, day=date.day) - - # now localize with timezone - dtz = self.tz.localize(dt) - - # build raw event - return log_line.LogLine(offset, LogTypes.RAW, dtz, None, data) - diff -r 0521cf830eb9 -r 645cf9c4441e log_source.py --- a/log_source.py Tue Feb 10 22:59:52 2009 +0200 +++ b/log_source.py Tue Feb 10 23:00:11 2009 +0200 @@ -46,13 +46,26 @@ A collection of IRC logs for a specific target in some format. Provides the possibility to read specific events """ - def __init__ (self, decoder) : + def __init__ (self, decoder, channel=None) : """ - Use the given LogSourceDecoder + The appropriate LogChannel must be given, as we need to be able to construct the LogLines. If it is not yet + known, then it can be given as None, and set later with bind_channel. + + Uses the given LogSourceDecoder to decode the lines. + """ + + self.channel = channel + self.decoder = decoder + + def bind_channel (self, channel) : + """ + Set this source's channel, where None was set before """ - self.decoder = decoder - + assert not self.channel + + self.channel = channel + def get_latest (self, count) : """ Yield the latest events, up to `count` of them. @@ -156,7 +169,7 @@ XXX: modify to implement LogSource? """ - def __init__ (self, path, parser, decoder, start_date=None, sep='\n') : + def __init__ (self, path, parser, decoder, channel=None, start_date=None, sep='\n') : """ Open the file at the given path, which contains lines as separated by the given separator. Lines are decoded using the given LogSourceDecoder, and then parsed using the given parser, using the given date @@ -166,6 +179,7 @@ """ # store + self.channel = channel self.path = path self.parser = parser self.start_date = start_date @@ -192,7 +206,7 @@ """ # just use our __iter__ - return self.parser.parse_lines(self, self.start_date, starting_offset=1) + return self.parser.parse_lines(self.channel, self, self.start_date, starting_offset=1) def read_from (self, dt) : """ @@ -316,22 +330,23 @@ # decode in reverse order, using our starting date.... # XXX: use lines[::-1] or reversed? # XXX: it may make more sense to parse in reverse order, using 'self.end_date' or something like that - return self.parser.parse_lines(reversed(lines), self.start_date) + return self.parser.parse_lines(self.channel, reversed(lines), self.start_date) class LogDirectory (LogSource) : """ A directory containing a series of timestamped LogFiles """ - def __init__ (self, path, tz, parser, decoder, filename_fmt) : + def __init__ (self, path, tz, parser, decoder, filename_fmt, channel=None) : """ - Load the logfiles at the given path. + Load the logfiles at the given path, which are for the given LogChannel Decode the file lines using the given decoder, the files are named according the the date in the given timezone and date format, and will be parsed using the given parser. """ # store + self.channel = channel self.path = path self.tz = tz self.parser = parser @@ -366,7 +381,7 @@ try : if load : # open+return the LogFile - return LogFile(path, self.parser, self.decoder, d) + return LogFile(path, self.parser, self.decoder, start_date=d, channel=self.channel) else : # test