| author | Tero Marttila <terom@fixme.fi> |
| Tue, 07 Oct 2008 20:31:35 +0300 | |
| changeset 14 | 115067dfba55 |
| parent 13 | 385b9a10d096 |
| permissions | -rw-r--r-- |
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
1 |
#ifndef LIB_LEXER_H |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
2 |
#define LIB_LEXER_H |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
3 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
4 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
5 |
* Simple FSM lexing |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
6 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
7 |
* The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
8 |
* transitions, which move the lexer from state to state based on each char of input at a time. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
9 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
10 |
* Whenever the state changes, the token callback is triggered with the collected token data. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
11 |
*/ |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
12 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
13 |
/* |
| 14 | 14 |
* Transition flags |
15 |
*/ |
|
16 |
enum lex_transition_flags {
|
|
17 |
LEX_TRANS_DEFAULT = 0x01, |
|
18 |
LEX_TRANS_FINAL = 0x02, |
|
19 |
}; |
|
20 |
||
21 |
/* |
|
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
22 |
* A transition from one state to another. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
23 |
*/ |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
24 |
struct lex_transition {
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
25 |
// applies to chars [left, right] |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
26 |
char left, right; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
27 |
|
| 14 | 28 |
// flags from lex_transition_flags |
29 |
char flags; |
|
30 |
||
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
31 |
// next state to enter |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
32 |
int next_state; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
33 |
}; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
34 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
35 |
/* |
| 14 | 36 |
* State flags |
37 |
*/ |
|
38 |
enum lex_state_flags {
|
|
39 |
LEX_STATE_END = 0x01; |
|
40 |
}; |
|
41 |
||
42 |
/* |
|
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
43 |
* A state |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
44 |
*/ |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
45 |
struct lex_state {
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
46 |
// the state name (for debugging) |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
47 |
const char *name; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
48 |
|
| 14 | 49 |
// flags from lex_state_flags |
50 |
char flags; |
|
51 |
||
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
52 |
// list of transitions for this state, terminated by a transition with next_state=0 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
53 |
struct lex_transition *trans_list; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
54 |
}; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
55 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
56 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
57 |
* Lex machine |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
58 |
*/ |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
59 |
struct lex {
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
60 |
// number of states |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
61 |
size_t state_count; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
62 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
63 |
// array of lex_states, indexable by the state id. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
64 |
struct lex_state *state_list; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
65 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
66 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
67 |
* Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
68 |
* `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token` |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
69 |
* is the state that terminated this token, and `prev_token` was the token before this one. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
70 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
71 |
* `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
72 |
* modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
73 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
74 |
* Return zero to have lexing continue, nonzero to stop lexing. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
75 |
*/ |
| 14 | 76 |
int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); |
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
77 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
78 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
79 |
* Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
80 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
81 |
* Return zero to have lexing continue, nonzero to stop lexing. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
82 |
*/ |
| 14 | 83 |
int (*char_fn) (int this_token, char token_char, void *arg); |
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
84 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
85 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
86 |
* Called when the end of input has been reached, `last_token` is the state that we terminated in. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
87 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
88 |
* Return zero to indiciate that the input was valid, nonzero to indicate an error. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
89 |
*/ |
| 14 | 90 |
int (*end_fn) (int last_token, void *arg); |
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
91 |
}; |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
92 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
93 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
94 |
* Helper macros for building the state_list |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
95 |
*/ |
| 14 | 96 |
#define LEX_STATE(enum_val) { #enum_val, 0,
|
97 |
#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
|
|
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
98 |
|
| 14 | 99 |
#define LEX_CHAR(c, to) { c, c, 0, to },
|
100 |
#define LEX_RANGE(l, r, to) { l, r, 0, to },
|
|
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
101 |
#define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
102 |
#define LEX_NUMBER(to) LEX_RANGE('0', '9', to)
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
103 |
#define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
104 |
#define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
105 |
|
| 14 | 106 |
#define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \
|
107 |
} |
|
108 |
#define LEX_END { 0, 0, 0, 0 } \
|
|
109 |
} |
|
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
110 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
111 |
/* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
112 |
* Lex it! |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
113 |
* |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
114 |
* Return zero to indiciate that the input was valid, nonzero otherwise. |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
115 |
*/ |
| 14 | 116 |
int lexer (const struct lex *lex, const char *input, void *arg); |
|
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
117 |
|
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
118 |
#endif /* LIB_LEXER_H */ |