|
#ifndef LIB_LEXER_H
#define LIB_LEXER_H

#include <stddef.h> /* size_t */
|
3 |
|
/*
 * Simple FSM lexing
 *
 * The lexer is implemented as a Finite State Machine, consisting of a number of states, each of which contains a
 * set of transitions that move the lexer from state to state based on one char of input at a time.
 *
 * Whenever the state changes, the token callback is triggered with the collected token data.
 */
|
12 |
|
/*
 * A transition from one state to another.
 *
 * A transition matches every input char in the inclusive range [left, right]. A transition whose next_state is 0
 * marks the end of a state's transition list.
 */
struct lex_transition {
    // inclusive bounds of the char range this transition applies to
    char left, right;

    // next state to enter
    int next_state;
};
|
23 |
|
/*
 * A single state of the lexer's state machine.
 */
struct lex_state {
    // the state name (for debugging)
    const char *name;

    // list of transitions for this state, terminated by a transition with next_state=0
    struct lex_transition *trans_list;
};
|
36 |
|
/*
 * Lex machine: the state table plus the callbacks invoked while lexing.
 */
struct lex {
    // number of states
    size_t state_count;

    // array of lex_states, indexable by the state id.
    struct lex_state *state_list;

    /*
     * Core token handler. Every time a full token is lexed (i.e. the state changes), this will be called.
     * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
     * is the state that terminated this token, and `prev_token` was the token before this one.
     *
     * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. It may therefore
     * be modified, but its contents will be replaced by the next token, so copy it if you need to keep hold of it.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);

    /*
     * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_char_fn) (int this_token, char token_char, void *arg);

    /*
     * Called when the end of input has been reached, `last_token` is the state that we terminated in.
     *
     * Return zero to indicate that the input was valid, nonzero to indicate an error.
     */
    int (*lex_end_fn) (int last_token, void *arg);
};
|
73 |
|
/*
 * Helper macros for building the state_list.
 *
 * Usage:
 *
 *  struct lex_state states[] = {
 *      LEX_STATE(MY_STATE)
 *          LEX_ALPHA(MY_WORD_STATE)
 *          LEX_CHAR('=', MY_EQ_STATE)
 *      LEX_STATE_END,
 *      ...
 *  };
 *
 * LEX_STATE opens a `struct lex_state` initializer, using the stringified enum name as the state name, and starts
 * an array compound literal that `trans_list` points to. LEX_STATE_END appends the terminating {0, 0, 0}
 * transition and closes both.
 *
 * Every transition macro expands to one or more complete transition entries *including* the trailing comma, so
 * they are written back to back with no separators between them.
 */
#define LEX_STATE(enum_val) { #enum_val, (struct lex_transition []) {

#define LEX_CHAR(c, to)     { (c), (c), (to) },
#define LEX_RANGE(l, r, to) { (l), (r), (to) },
#define LEX_ALPHA(to)       LEX_RANGE('a', 'z', to) LEX_RANGE('A', 'Z', to)
#define LEX_NUMBER(to)      LEX_RANGE('0', '9', to)
#define LEX_ALNUM(to)       LEX_ALPHA(to) LEX_NUMBER(to) LEX_CHAR('-', to) LEX_CHAR('_', to)
#define LEX_WHITESPACE(to)  LEX_CHAR(' ', to) LEX_CHAR('\n', to) LEX_CHAR('\t', to)

#define LEX_STATE_END       { 0, 0, 0 } \
    } }
|
88 |
|
/*
 * Lex it!
 *
 * `lex` is the lex machine (state table and callbacks) and `input` is the text to lex.
 * NOTE(review): `arg` is presumably passed through unchanged as the `arg` of the callbacks in `struct lex`, and
 * `input` is presumably a NUL-terminated string - the implementation is not visible here, confirm against it.
 *
 * Return zero to indicate that the input was valid, nonzero otherwise.
 */
int lexer (struct lex *lex, const char *input, void *arg);
|
95 |
|
96 #endif /* LIB_LEXER_H */ |