1 #ifndef LIB_LEXER_H |
|
2 #define LIB_LEXER_H |
|
3 |
|
4 /* |
|
5 * Simple FSM lexing |
|
6 * |
|
7 * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of |
|
8 * transitions, which move the lexer from state to state based on each char of input at a time. |
|
9 * |
|
10 * Whenever the state changes, the token callback is triggered with the collected token data. |
|
11 */ |
|
12 |
|
13 #include <sys/types.h> |
|
14 |
|
15 /* |
|
16 * Transition flags |
|
17 */ |
|
18 enum lex_transition_flags { |
|
19 LEX_TRANS_DEFAULT = 0x01, |
|
20 /* not supported |
|
21 LEX_TRANS_FINAL = 0x02, */ |
|
22 LEX_TRANS_INVALID = 0x04, |
|
23 }; |
|
24 |
|
25 /* |
|
26 * A transition from one state to another. |
|
27 */ |
|
28 struct lex_transition { |
|
29 // applies to chars [left, right] |
|
30 char left, right; |
|
31 |
|
32 // flags from lex_transition_flags |
|
33 char flags; |
|
34 |
|
35 // next state to enter |
|
36 int next_state; |
|
37 }; |
|
38 |
|
39 /* |
|
40 * State flags |
|
41 */ |
|
42 enum lex_state_flags { |
|
43 LEX_STATE_END = 0x01, |
|
44 }; |
|
45 |
|
46 /* |
|
47 * A state |
|
48 */ |
|
49 struct lex_state { |
|
50 // the state name (for debugging) |
|
51 const char *name; |
|
52 |
|
53 // flags from lex_state_flags |
|
54 char flags; |
|
55 |
|
56 // list of transitions for this state, terminated by a transition with next_state=0 |
|
57 struct lex_transition trans_list[15]; |
|
58 }; |
|
59 |
|
60 /* |
|
61 * Special states, these are all defined as zero |
|
62 */ |
|
63 |
|
64 // shows up in token_fn as the value of next_token when this_token is the last token. |
|
65 #define LEX_EOF 0 |
|
66 |
|
67 // shows up as the initial value of prev_token |
|
68 #define LEX_INITIAL 0 |
|
69 |
|
70 /* |
|
71 * Lex machine |
|
72 */ |
|
73 struct lex { |
|
74 /* |
|
75 * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called. |
|
76 * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token` |
|
77 * is the state that terminated this token, and `prev_token` was the token before this one. |
|
78 * |
|
79 * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be |
|
80 * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it. |
|
81 * |
|
82 * Return zero to have lexing continue, nonzero to stop lexing. |
|
83 */ |
|
84 int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); |
|
85 |
|
86 /* |
|
87 * Called on every char handled by the lexer. |
|
88 * |
|
89 * The NUL byte at the end of the input string is not passed to char_fn (why not?). |
|
90 * |
|
91 * Return zero to have lexing continue, nonzero to stop lexing. |
|
92 */ |
|
93 int (*char_fn) (char token_char, int from_token, int to_token, void *arg); |
|
94 |
|
95 /* |
|
96 * Called when the end of input has been reached, `last_token` is the state that we terminated in. |
|
97 * |
|
98 * Return zero to indiciate that the input was valid, nonzero to indicate an error. |
|
99 */ |
|
100 int (*end_fn) (int last_token, void *arg); |
|
101 |
|
102 // number of states |
|
103 size_t state_count; |
|
104 |
|
105 // initial state |
|
106 int initial_state; |
|
107 |
|
108 // array of lex_states, indexable by the state id. |
|
109 struct lex_state state_list[]; |
|
110 }; |
|
111 |
|
112 /* |
|
113 * Helper macros for building the state_list |
|
114 */ |
|
115 #define LEX_STATE(enum_val) { #enum_val, 0, |
|
116 #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END, |
|
117 |
|
118 #define LEX_CHAR(c, to) { c, c, 0, to } |
|
119 #define LEX_RANGE(l, r, to) { l, r, 0, to } |
|
120 #define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to) |
|
121 #define LEX_NUMBER(to) LEX_RANGE('0', '9', to) |
|
122 #define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to) |
|
123 #define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to) |
|
124 #define LEX_INVALID(c) { c, c, LEX_TRANS_INVALID, 0 } |
|
125 |
|
126 #define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \ |
|
127 } |
|
128 #define LEX_END { 0, 0, 0, 0 } \ |
|
129 } |
|
130 |
|
131 /* |
|
132 * Helpers for handling states |
|
133 */ |
|
134 #define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...") |
|
135 |
|
136 /* |
|
137 * Lex it! |
|
138 * |
|
139 * Return zero to indiciate that the input was valid, nonzero otherwise. |
|
140 */ |
|
141 int lexer (const struct lex *lex, const char *input, void *arg); |
|
142 |
|
143 #endif /* LIB_LEXER_H */ |
|