15
|
1 |
#ifndef LIB_LEXER_H
|
|
2 |
#define LIB_LEXER_H
|
|
3 |
|
|
4 |
/*
|
|
5 |
* Simple FSM lexing
|
|
6 |
*
|
|
7 |
* The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of
|
|
8 |
* transitions, which move the lexer from state to state based on each char of input at a time.
|
|
9 |
*
|
|
10 |
* Whenever the state changes, the token callback is triggered with the collected token data.
|
|
11 |
*/
|
|
12 |
|
|
13 |
#include <sys/types.h>
|
|
14 |
|
|
15 |
/*
|
|
16 |
* Transition flags
|
|
17 |
*/
|
|
18 |
enum lex_transition_flags {
|
|
19 |
LEX_TRANS_DEFAULT = 0x01,
|
|
20 |
LEX_TRANS_FINAL = 0x02,
|
|
21 |
LEX_TRANS_INVALID = 0x04,
|
|
22 |
};
|
|
23 |
|
|
24 |
/*
|
|
25 |
* A transition from one state to another.
|
|
26 |
*/
|
|
27 |
struct lex_transition {
|
|
28 |
// applies to chars [left, right]
|
|
29 |
char left, right;
|
|
30 |
|
|
31 |
// flags from lex_transition_flags
|
|
32 |
char flags;
|
|
33 |
|
|
34 |
// next state to enter
|
|
35 |
int next_state;
|
|
36 |
};
|
|
37 |
|
|
38 |
/*
|
|
39 |
* State flags
|
|
40 |
*/
|
|
41 |
enum lex_state_flags {
|
|
42 |
LEX_STATE_END = 0x01,
|
|
43 |
};
|
|
44 |
|
|
45 |
/*
|
|
46 |
* A state
|
|
47 |
*/
|
|
48 |
struct lex_state {
|
|
49 |
// the state name (for debugging)
|
|
50 |
const char *name;
|
|
51 |
|
|
52 |
// flags from lex_state_flags
|
|
53 |
char flags;
|
|
54 |
|
|
55 |
// list of transitions for this state, terminated by a transition with next_state=0
|
|
56 |
struct lex_transition trans_list[15];
|
|
57 |
};
|
|
58 |
|
|
59 |
/*
|
|
60 |
* Special tokens
|
|
61 |
*/
|
|
62 |
|
|
63 |
// shows up in token_fn as the value of next_token when this_token is the last token.
|
|
64 |
#define LEX_EOF 0
|
|
65 |
|
|
66 |
/*
|
|
67 |
* Lex machine
|
|
68 |
*/
|
|
69 |
struct lex {
|
|
70 |
/*
|
|
71 |
* Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
|
|
72 |
* `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
|
|
73 |
* is the state that terminated this token, and `prev_token` was the token before this one.
|
|
74 |
*
|
|
75 |
* `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
|
|
76 |
* modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
|
|
77 |
*
|
|
78 |
* Return zero to have lexing continue, nonzero to stop lexing.
|
|
79 |
*/
|
|
80 |
int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
|
|
81 |
|
|
82 |
/*
|
|
83 |
* Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
|
|
84 |
*
|
|
85 |
* Return zero to have lexing continue, nonzero to stop lexing.
|
|
86 |
*/
|
|
87 |
int (*char_fn) (int this_token, char token_char, void *arg);
|
|
88 |
|
|
89 |
/*
|
|
90 |
* Called when the end of input has been reached, `last_token` is the state that we terminated in.
|
|
91 |
*
|
|
92 |
* Return zero to indiciate that the input was valid, nonzero to indicate an error.
|
|
93 |
*/
|
|
94 |
int (*end_fn) (int last_token, void *arg);
|
|
95 |
|
|
96 |
// number of states
|
|
97 |
size_t state_count;
|
|
98 |
|
|
99 |
// array of lex_states, indexable by the state id.
|
|
100 |
struct lex_state state_list[];
|
|
101 |
};
|
|
102 |
|
|
103 |
/*
|
|
104 |
* Helper macros for building the state_list
|
|
105 |
*/
|
|
106 |
#define LEX_STATE(enum_val) { #enum_val, 0,
|
|
107 |
#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
|
|
108 |
|
|
109 |
#define LEX_CHAR(c, to) { c, c, 0, to }
|
|
110 |
#define LEX_RANGE(l, r, to) { l, r, 0, to }
|
|
111 |
#define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
|
|
112 |
#define LEX_NUMBER(to) LEX_RANGE('0', '9', to)
|
|
113 |
#define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
|
|
114 |
#define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
|
|
115 |
#define LEX_INVALID(c) { c, c, LEX_TRANS_INVALID, 0 }
|
|
116 |
|
|
117 |
#define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \
|
|
118 |
}
|
|
119 |
#define LEX_END { 0, 0, 0, 0 } \
|
|
120 |
}
|
|
121 |
|
|
122 |
/*
|
|
123 |
* Lex it!
|
|
124 |
*
|
|
125 |
* Return zero to indiciate that the input was valid, nonzero otherwise.
|
|
126 |
*/
|
|
127 |
int lexer (const struct lex *lex, const char *input, void *arg);
|
|
128 |
|
|
129 |
#endif /* LIB_LEXER_H */
|