author | Tero Marttila <terom@fixme.fi> |
Tue, 07 Oct 2008 18:38:03 +0300 | |
changeset 13 | 385b9a10d096 |
child 14 | 115067dfba55 |
permissions | -rw-r--r-- |
13
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
1 |
#ifndef LIB_LEXER_H |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
2 |
#define LIB_LEXER_H |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
3 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
4 |
/* |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
5 |
* Simple FSM lexing |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
6 |
* |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
7 |
* The lexer is implemented as a Finite State Machine, consisting of a number of states, each of which contains a set of |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
8 |
* transitions, which move the lexer from state to state based on each char of input at a time. |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
9 |
* |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
10 |
* Whenever the state changes, the token callback is triggered with the collected token data. |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
11 |
*/ |
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
12 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
13 |
/*
 * A transition from one state to another.
 *
 * A transition matches a single input char when that char lies within the
 * inclusive range [left, right]; a single-char transition simply uses the
 * same value for both bounds.
 */
struct lex_transition {
    // inclusive bounds of the char range that this transition applies to
    char left, right;

    // id of the state to enter when the transition matches
    int next_state;
};
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
23 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
24 |
/*
 * A single state of the lexer's state machine.
 */
struct lex_state {
    // the state name (for debugging)
    const char *name;

    // list of transitions out of this state; the list is terminated by an
    // entry whose next_state is 0
    struct lex_transition *trans_list;
};
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
36 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
37 |
// size_t is used below; include its definition so that this header is self-contained
#include <stddef.h>

/*
 * Lex machine: the table of states plus the callbacks that are invoked while lexing.
 */
struct lex {
    // number of states in state_list
    size_t state_count;

    // array of lex_states, indexable by the state id
    struct lex_state *state_list;

    /*
     * Core token handler. Every time a full token is lexed (i.e. the state changes), this will be called.
     *
     * `this_token` represents the full token that was parsed, and `token_data` is the token's value.
     * `next_token` is the state that terminated this token, and `prev_token` was the token before this one.
     *
     * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. The callback
     * may therefore modify it, but its contents will be replaced by the next token; if you need to keep hold
     * of the value, copy it.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);

    /*
     * Called on every char handled by the lexer. `this_token` is the state of the token that the char
     * belongs to, and `token_char` is the char itself.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_char_fn) (int this_token, char token_char, void *arg);

    /*
     * Called when the end of input has been reached. `last_token` is the state that we terminated in.
     *
     * Return zero to indicate that the input was valid, nonzero to indicate an error.
     */
    int (*lex_end_fn) (int last_token, void *arg);
};
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
73 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
74 |
/*
 * Helper macros for building the state_list.
 *
 * Usage, as one element of an array of struct lex_state:
 *
 *      LEX_STATE(MY_STATE)
 *          LEX_ALPHA(MY_WORD_STATE)
 *          LEX_CHAR(':', MY_COLON_STATE)
 *      LEX_STATE_END,
 *
 * Every transition macro expands to one or more `{ left, right, next_state }` entries, and each entry
 * carries its own trailing comma. Hence, do not separate the transition macros with commas, neither
 * at the use site nor inside the composite macros below; a doubled comma is a syntax error inside an
 * initializer list.
 */

/*
 * Open a state named after `enum_val`. `trans_list` is a pointer, so the transition list is supplied as an
 * array compound literal (C99) rather than as a bare brace list, which cannot initialize a pointer member.
 */
#define LEX_STATE(enum_val)     { #enum_val, (struct lex_transition[]) {

    // a transition on exactly the char `c`
    #define LEX_CHAR(c, to)         { (c), (c), (to) },

    // a transition on any char within the inclusive range [l, r]
    #define LEX_RANGE(l, r, to)     { (l), (r), (to) },

    // composite transitions, built from the above
    #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to) LEX_RANGE('A', 'Z', to)
    #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
    #define LEX_ALNUM(to)           LEX_ALPHA(to) LEX_NUMBER(to) LEX_CHAR('-', to) LEX_CHAR('_', to)
    #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to) LEX_CHAR('\n', to) LEX_CHAR('\t', to)

/*
 * Close a state: appends the terminating transition (next_state=0), and closes both the transition list
 * and the state itself.
 */
#define LEX_STATE_END               { 0, 0, 0 } \
                                } }
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
88 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
89 |
/*
 * Lex it!
 *
 * Runs the state machine described by `lex` over `input`. `arg` is handed through to the callbacks
 * in `lex` as their `arg` parameter.
 *
 * Return zero to indicate that the input was valid, nonzero otherwise.
 */
int lexer (struct lex *lex, const char *input, void *arg);
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
95 |
|
385b9a10d096
inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff
changeset
|
96 |
#endif /* LIB_LEXER_H */ |