terom@15: #ifndef LIB_LEXER_H terom@15: #define LIB_LEXER_H terom@15: terom@15: /* terom@15: * Simple FSM lexing terom@15: * terom@15: * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of terom@15: * transitions, which move the lexer from state to state based on each char of input at a time. terom@15: * terom@15: * Whenever the state changes, the token callback is triggered with the collected token data. terom@15: */ terom@15: terom@15: #include terom@15: terom@15: /* terom@15: * Transition flags terom@15: */ terom@15: enum lex_transition_flags { terom@15: LEX_TRANS_DEFAULT = 0x01, terom@16: /* not supported terom@16: LEX_TRANS_FINAL = 0x02, */ terom@15: LEX_TRANS_INVALID = 0x04, terom@15: }; terom@15: terom@15: /* terom@15: * A transition from one state to another. terom@15: */ terom@15: struct lex_transition { terom@15: // applies to chars [left, right] terom@15: char left, right; terom@15: terom@15: // flags from lex_transition_flags terom@15: char flags; terom@15: terom@15: // next state to enter terom@15: int next_state; terom@15: }; terom@15: terom@15: /* terom@15: * State flags terom@15: */ terom@15: enum lex_state_flags { terom@15: LEX_STATE_END = 0x01, terom@15: }; terom@15: terom@15: /* terom@15: * A state terom@15: */ terom@15: struct lex_state { terom@15: // the state name (for debugging) terom@15: const char *name; terom@15: terom@15: // flags from lex_state_flags terom@15: char flags; terom@15: terom@15: // list of transitions for this state, terminated by a transition with next_state=0 terom@15: struct lex_transition trans_list[15]; terom@15: }; terom@15: terom@15: /* terom@16: * Special states, these are all defined as zero terom@15: */ terom@15: terom@15: // shows up in token_fn as the value of next_token when this_token is the last token. terom@15: #define LEX_EOF 0 terom@15: terom@16: // shows up as the initial value of prev_token terom@16: #define LEX_INITIAL 0 terom@16: terom@15: /* terom@15: * Lex machine terom@15: */ terom@15: struct lex { terom@15: /* terom@15: * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called. terom@15: * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token` terom@15: * is the state that terminated this token, and `prev_token` was the token before this one. terom@15: * terom@15: * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be terom@15: * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it. terom@15: * terom@15: * Return zero to have lexing continue, nonzero to stop lexing. terom@15: */ terom@15: int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); terom@15: terom@15: /* terom@16: * Called on every char handled by the lexer. terom@16: * terom@16: * The NUL byte at the end of the input string is not passed to char_fn (why not?). terom@15: * terom@15: * Return zero to have lexing continue, nonzero to stop lexing. terom@15: */ terom@16: int (*char_fn) (char token_char, int from_token, int to_token, void *arg); terom@15: terom@15: /* terom@15: * Called when the end of input has been reached, `last_token` is the state that we terminated in. terom@15: * terom@15: * Return zero to indiciate that the input was valid, nonzero to indicate an error. terom@15: */ terom@15: int (*end_fn) (int last_token, void *arg); terom@15: terom@15: // number of states terom@15: size_t state_count; terom@15: terom@16: // initial state terom@16: int initial_state; terom@16: terom@15: // array of lex_states, indexable by the state id. terom@15: struct lex_state state_list[]; terom@15: }; terom@15: terom@15: /* terom@15: * Helper macros for building the state_list terom@15: */ terom@15: #define LEX_STATE(enum_val) { #enum_val, 0, terom@15: #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END, terom@15: terom@15: #define LEX_CHAR(c, to) { c, c, 0, to } terom@15: #define LEX_RANGE(l, r, to) { l, r, 0, to } terom@15: #define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to) terom@15: #define LEX_NUMBER(to) LEX_RANGE('0', '9', to) terom@15: #define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to) terom@15: #define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to) terom@15: #define LEX_INVALID(c) { c, c, LEX_TRANS_INVALID, 0 } terom@15: terom@15: #define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \ terom@15: } terom@15: #define LEX_END { 0, 0, 0, 0 } \ terom@15: } terom@15: terom@15: /* terom@16: * Helpers for handling states terom@16: */ terom@16: #define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...") terom@16: terom@16: /* terom@15: * Lex it! terom@15: * terom@15: * Return zero to indiciate that the input was valid, nonzero otherwise. terom@15: */ terom@15: int lexer (const struct lex *lex, const char *input, void *arg); terom@15: terom@15: #endif /* LIB_LEXER_H */