|
1 #ifndef LIB_LEXER_H |
|
2 #define LIB_LEXER_H |
|
3 |
|
4 /* |
|
5 * Simple FSM lexing |
|
6 * |
|
7 * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of |
|
8 * transitions, which move the lexer from state to state based on each char of input at a time. |
|
9 * |
|
10 * Whenever the state changes, the token callback is triggered with the collected token data. |
|
11 */ |
|
12 |
|
13 #include <sys/types.h> |
|
14 |
|
15 /* |
|
16 * Transition flags |
|
17 */ |
|
18 enum lex_transition_flags { |
|
19 LEX_TRANS_DEFAULT = 0x01, |
|
20 LEX_TRANS_FINAL = 0x02, |
|
21 LEX_TRANS_INVALID = 0x04, |
|
22 }; |
|
23 |
|
24 /* |
|
25 * A transition from one state to another. |
|
26 */ |
|
27 struct lex_transition { |
|
28 // applies to chars [left, right] |
|
29 char left, right; |
|
30 |
|
31 // flags from lex_transition_flags |
|
32 char flags; |
|
33 |
|
34 // next state to enter |
|
35 int next_state; |
|
36 }; |
|
37 |
|
38 /* |
|
39 * State flags |
|
40 */ |
|
41 enum lex_state_flags { |
|
42 LEX_STATE_END = 0x01, |
|
43 }; |
|
44 |
|
45 /* |
|
46 * A state |
|
47 */ |
|
48 struct lex_state { |
|
49 // the state name (for debugging) |
|
50 const char *name; |
|
51 |
|
52 // flags from lex_state_flags |
|
53 char flags; |
|
54 |
|
55 // list of transitions for this state, terminated by a transition with next_state=0 |
|
56 struct lex_transition trans_list[15]; |
|
57 }; |
|
58 |
|
59 /* |
|
60 * Special tokens |
|
61 */ |
|
62 |
|
63 // shows up in token_fn as the value of next_token when this_token is the last token. |
|
64 #define LEX_EOF 0 |
|
65 |
|
66 /* |
|
67 * Lex machine |
|
68 */ |
|
69 struct lex { |
|
70 /* |
|
71 * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called. |
|
72 * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token` |
|
73 * is the state that terminated this token, and `prev_token` was the token before this one. |
|
74 * |
|
75 * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be |
|
76 * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it. |
|
77 * |
|
78 * Return zero to have lexing continue, nonzero to stop lexing. |
|
79 */ |
|
80 int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); |
|
81 |
|
82 /* |
|
83 * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to. |
|
84 * |
|
85 * Return zero to have lexing continue, nonzero to stop lexing. |
|
86 */ |
|
87 int (*char_fn) (int this_token, char token_char, void *arg); |
|
88 |
|
89 /* |
|
90 * Called when the end of input has been reached, `last_token` is the state that we terminated in. |
|
91 * |
|
92 * Return zero to indiciate that the input was valid, nonzero to indicate an error. |
|
93 */ |
|
94 int (*end_fn) (int last_token, void *arg); |
|
95 |
|
96 // number of states |
|
97 size_t state_count; |
|
98 |
|
99 // array of lex_states, indexable by the state id. |
|
100 struct lex_state state_list[]; |
|
101 }; |
|
102 |
|
103 /* |
|
104 * Helper macros for building the state_list |
|
105 */ |
|
106 #define LEX_STATE(enum_val) { #enum_val, 0, |
|
107 #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END, |
|
108 |
|
109 #define LEX_CHAR(c, to) { c, c, 0, to } |
|
110 #define LEX_RANGE(l, r, to) { l, r, 0, to } |
|
111 #define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to) |
|
112 #define LEX_NUMBER(to) LEX_RANGE('0', '9', to) |
|
113 #define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to) |
|
114 #define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to) |
|
115 #define LEX_INVALID(c) { c, c, LEX_TRANS_INVALID, 0 } |
|
116 |
|
117 #define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \ |
|
118 } |
|
119 #define LEX_END { 0, 0, 0, 0 } \ |
|
120 } |
|
121 |
|
122 /* |
|
123 * Lex it! |
|
124 * |
|
125 * Return zero to indiciate that the input was valid, nonzero otherwise. |
|
126 */ |
|
127 int lexer (const struct lex *lex, const char *input, void *arg); |
|
128 |
|
129 #endif /* LIB_LEXER_H */ |