1 |
|
"""
    Parsing trees of nodes stored using a python-like syntax.

    A file consists of a number of lines, and each line consists of indenting whitespace and data. Each line has a parent.
"""
|
7 |
|
class TreeParseError(Exception):
    """
        Raised when a tree file cannot be parsed.
    """
|
14 |
|
def _read_lines(path, stop_tokens, charset):
    """
        Yield a (line_number, indent, line) tuple for every non-empty line of the file at `path`.

        The file is read in binary mode and each line is decoded using `charset`.

        line_number is the 1-based number of the line in the file (skipped empty lines still count), indent is the
        number of leading space characters, and line is the decoded text with surrounding whitespace stripped.

        stop_tokens is a collection of characters: if the first non-space character of a line is one of them, the
        indentation of that line is reported as zero.

        A tab among the leading spaces, or as the first non-space character, trips an assertion.
    """

    # the with-block guarantees the file is closed once the generator is exhausted or closed
    with open(path, 'rb') as stream:
        for line_number, raw in enumerate(stream, 1):
            # decode to unicode
            line = raw.decode(charset)

            indent = 0

            # count leading spaces, stopping at the first non-space character
            for char in line:
                # tabs break indent counting
                # NOTE: kept as an assert so the raised exception type stays the same; it is skipped under -O
                assert char != '\t', "tab in indentation on %s:%d" % (path, line_number)

                if char == ' ':
                    indent += 1

                elif char in stop_tokens:
                    # consider the line as not having any indentation at all
                    indent = 0
                    break

                else:
                    break

            # strip surrounding whitespace, including the trailing newline
            line = line.strip()

            # ignore empty lines
            if not line:
                continue

            yield line_number, indent, line
|
57 |
|
def parse(path, stop_tokens='', charset='utf8'):
    """
        Read the file at `path` and build a tree out of its indented lines.

        Every node is a (line_number, line, children) tuple, where children is a list of further nodes. The first
        non-empty line becomes the root node; the root node is returned, or None if the file has no non-empty lines.

        stop_tokens and charset are passed on to _read_lines.

        Raises TreeParseError if a line unindents to a level that does not match any currently open level.
    """

    root = None

    # the node built from the previous line, None before the first line
    previous = None

    # (indent, parent) pairs for the currently open indent levels, innermost last
    levels = []

    for line_number, indent, line in _read_lines(path, stop_tokens, charset):
        node = (line_number, line, [])

        if previous is None:
            # the first node is the root: it has no parent, and is registered as the parent for indent level zero
            root = node
            levels.append((0, node))
            previous = node
            continue

        # peek at the innermost open level
        level_indent, level_parent = levels[-1]

        if indent > level_indent:
            # deeper than before: the previous node becomes the parent of a newly opened level
            level_parent = previous
            levels.append((indent, level_parent))

        else:
            # drop levels that are deeper than this line (no-op if the indent is unchanged)
            while level_indent > indent:
                levels.pop()
                level_indent, level_parent = levels[-1]

            # we must land exactly on an open level
            if level_indent < indent:
                raise TreeParseError("Bad unindent on %s:%d, %d < %d" % (path, line_number, level_indent, indent))

        # attach to the parent's children list
        level_parent[2].append(node)

        previous = node

    return root
|
133 |
|