15
|
1 |
#ifndef LIB_LEXER_H
|
|
2 |
#define LIB_LEXER_H
|
|
3 |
|
|
4 |
/*
|
|
5 |
* Simple FSM lexing
|
|
6 |
*
|
|
7 |
* The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of
|
|
8 |
* transitions, which move the lexer from state to state based on each char of input at a time.
|
|
9 |
*
|
|
10 |
* Whenever the state changes, the token callback is triggered with the collected token data.
|
|
11 |
*/
|
|
12 |
|
|
13 |
#include <sys/types.h>
|
|
14 |
|
|
15 |
/*
|
|
16 |
* Transition flags
|
|
17 |
*/
|
|
18 |
enum lex_transition_flags {
|
|
19 |
LEX_TRANS_DEFAULT = 0x01,
|
16
|
20 |
/* not supported
|
|
21 |
LEX_TRANS_FINAL = 0x02, */
|
15
|
22 |
LEX_TRANS_INVALID = 0x04,
|
|
23 |
};
|
|
24 |
|
|
25 |
/*
|
|
26 |
* A transition from one state to another.
|
|
27 |
*/
|
|
28 |
struct lex_transition {
|
|
29 |
// applies to chars [left, right]
|
|
30 |
char left, right;
|
|
31 |
|
|
32 |
// flags from lex_transition_flags
|
|
33 |
char flags;
|
|
34 |
|
|
35 |
// next state to enter
|
|
36 |
int next_state;
|
|
37 |
};
|
|
38 |
|
|
39 |
/*
|
|
40 |
* State flags
|
|
41 |
*/
|
|
42 |
enum lex_state_flags {
|
|
43 |
LEX_STATE_END = 0x01,
|
|
44 |
};
|
|
45 |
|
|
46 |
/*
|
|
47 |
* A state
|
|
48 |
*/
|
|
49 |
struct lex_state {
|
|
50 |
// the state name (for debugging)
|
|
51 |
const char *name;
|
|
52 |
|
|
53 |
// flags from lex_state_flags
|
|
54 |
char flags;
|
|
55 |
|
|
56 |
// list of transitions for this state, terminated by a transition with next_state=0
|
|
57 |
struct lex_transition trans_list[15];
|
|
58 |
};
|
|
59 |
|
|
60 |
/*
|
16
|
61 |
* Special states, these are all defined as zero
|
15
|
62 |
*/
|
|
63 |
|
|
64 |
// shows up in token_fn as the value of next_token when this_token is the last token.
|
|
65 |
#define LEX_EOF 0
|
|
66 |
|
16
|
67 |
// shows up as the initial value of prev_token
|
|
68 |
#define LEX_INITIAL 0
|
|
69 |
|
15
|
70 |
/*
|
|
71 |
* Lex machine
|
|
72 |
*/
|
|
73 |
struct lex {
|
|
74 |
/*
|
|
75 |
* Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
|
|
76 |
* `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
|
|
77 |
* is the state that terminated this token, and `prev_token` was the token before this one.
|
|
78 |
*
|
|
79 |
* `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
|
|
80 |
* modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
|
|
81 |
*
|
|
82 |
* Return zero to have lexing continue, nonzero to stop lexing.
|
|
83 |
*/
|
|
84 |
int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
|
|
85 |
|
|
86 |
/*
|
16
|
87 |
* Called on every char handled by the lexer.
|
|
88 |
*
|
|
89 |
* The NUL byte at the end of the input string is not passed to char_fn (why not?).
|
15
|
90 |
*
|
|
91 |
* Return zero to have lexing continue, nonzero to stop lexing.
|
|
92 |
*/
|
16
|
93 |
int (*char_fn) (char token_char, int from_token, int to_token, void *arg);
|
15
|
94 |
|
|
95 |
/*
|
|
96 |
* Called when the end of input has been reached, `last_token` is the state that we terminated in.
|
|
97 |
*
|
|
98 |
* Return zero to indiciate that the input was valid, nonzero to indicate an error.
|
|
99 |
*/
|
|
100 |
int (*end_fn) (int last_token, void *arg);
|
|
101 |
|
|
102 |
// number of states
|
|
103 |
size_t state_count;
|
|
104 |
|
16
|
105 |
// initial state
|
|
106 |
int initial_state;
|
|
107 |
|
15
|
108 |
// array of lex_states, indexable by the state id.
|
|
109 |
struct lex_state state_list[];
|
|
110 |
};
|
|
111 |
|
|
112 |
/*
|
|
113 |
* Helper macros for building the state_list
|
|
114 |
*/
|
|
115 |
#define LEX_STATE(enum_val) { #enum_val, 0,
|
|
116 |
#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
|
|
117 |
|
|
118 |
#define LEX_CHAR(c, to) { c, c, 0, to }
|
|
119 |
#define LEX_RANGE(l, r, to) { l, r, 0, to }
|
|
120 |
#define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
|
|
121 |
#define LEX_NUMBER(to) LEX_RANGE('0', '9', to)
|
|
122 |
#define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
|
|
123 |
#define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
|
|
124 |
#define LEX_INVALID(c) { c, c, LEX_TRANS_INVALID, 0 }
|
|
125 |
|
|
126 |
#define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \
|
|
127 |
}
|
|
128 |
#define LEX_END { 0, 0, 0, 0 } \
|
|
129 |
}
|
|
130 |
|
|
131 |
/*
|
16
|
132 |
* Helpers for handling states
|
|
133 |
*/
|
|
134 |
#define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...")
|
|
135 |
|
|
136 |
/*
|
15
|
137 |
* Lex it!
|
|
138 |
*
|
|
139 |
* Return zero to indiciate that the input was valid, nonzero otherwise.
|
|
140 |
*/
|
|
141 |
int lexer (const struct lex *lex, const char *input, void *arg);
|
|
142 |
|
|
143 |
#endif /* LIB_LEXER_H */
|