tree_parse.py
author Tero Marttila <terom@fixme.fi>
Mon, 16 Feb 2009 19:08:17 +0200
changeset 78 a46d2fc07951
parent 77 bef7196f7682
permissions -rw-r--r--
add test for tree_parse filesystem stuff

"""
    Parsing trees of node stored using a python-like syntax.

    A file consists of a number of lines, and each line consists of indenting whitespace and data. Each line has a parent
"""

class TreeParseError (Exception) :
    """
        Error parsing a tree file
    """

    pass

def _read_lines (file, stop_tokens=None, charset='utf8') :
    """
        Reads lines from the given path, ignoring empty lines, and yielding (line_number, indent, line) tuples, where 
        line_number is the line number, indent counts the amount of leading whitespace, and line is the actual line
        data with whitespace stripped.

        File is either a str with the filesystem path, or a something that supports iterating over lines. 
        
        Charset is what to decode the lines with, or pass None to not decode.

        Stop tokens is a list of chars to stop counting indentation on - if such a line begins with such a char, its
        indentation is taken as zero. Ignored if empty.
    """
    
    # open path?
    if isinstance(file, str) :
        file = open(file, 'rb')
    
    # iterate over lines
    for line_number, line in enumerate(file) :
        if charset :
            # decode to unicode
            line = line.decode(charset)

        indent = 0

        # count indent
        for char in line :
            # tabs break things
            assert char != '\t'
            
            # increment up to first non-space char
            if char == ' ' :
                indent += 1
            
            elif stop_tokens and char in stop_tokens :
                # consider line as not having any indentation at all
                indent = 0
                break

            else :
                break
        
        # strip whitespace
        line = line.strip()

        # ignore empty lines
        if not line :
            continue

        # yield
        yield line_number + 1, indent, line

def parse (file, stop_tokens='', charset='utf8') :
    """
        Reads and parses the file at the given path, returning a list of (line_number, line, children) tuples.
    """

    # stack of (indent, PageInfo) items
    stack = []

    # the root item
    root = None

    # the previous item processed, None for first one
    prev = None
    
    # read lines
    for line_number, indent, line in _read_lines(file, stop_tokens, charset) :
        # create item
        item = (line_number, line, [])

        # are we the first item?
        if not prev :
            # root node does not have a parent
            parent = None
            
            # set root
            root = item

            # initialize stack
            stack.append((0, root))
            
        else :
            # peek stack
            stack_indent, stack_parent = stack[-1]

            # new indent level?
            if indent > stack_indent :
                # set parent to previous item, and push new indent level + parent to stack
                parent = prev

                # push new indent level + its parent
                stack.append((indent, parent))

            # same indent level as previous
            elif indent == stack_indent :
                # parent is the one of the current stack level, stack doesn't change
                parent = stack_parent
            
            # unravel stack
            elif indent < stack_indent :
                while True :
                    # remove current stack level
                    stack.pop(-1)

                    # peek next level
                    stack_indent, stack_parent = stack[-1]
                    
                    # found the level to return to?
                    if stack_indent == indent :
                        # restore prev
                        parent = stack_parent

                        break

                    elif stack_indent < indent :
                        raise TreeParseError("Bad unindent on %s:%d, %d < %d" % (file, line_number, stack_indent, indent))
        
        # add to parent?
        if parent :
            parent[2].append(item)

        # update prev
        prev = item
    
    # return the root
    return root