terom@15: #ifndef LIB_LEXER_H
terom@15: #define LIB_LEXER_H
terom@15: 
terom@15: /*
terom@15:  * Simple FSM lexing
terom@15:  *
terom@15:  * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of
terom@15:  * transitions, which move the lexer from state to state based on each char of input at a time.
terom@15:  *
terom@15:  * Whenever the state changes, the token callback is triggered with the collected token data.
terom@15:  */
terom@15: 
terom@15: #include <sys/types.h>
terom@15: 
terom@15: /*
terom@15:  * Transition flags
terom@15:  */
terom@15: enum lex_transition_flags {
terom@15:     LEX_TRANS_DEFAULT   = 0x01,
terom@16:     /* not supported
terom@16:     LEX_TRANS_FINAL     = 0x02, */
terom@15:     LEX_TRANS_INVALID   = 0x04,
terom@15: };
terom@15: 
terom@15: /*
terom@15:  * A transition from one state to another.
terom@15:  */
terom@15: struct lex_transition {
terom@15:     // applies to chars [left, right]
terom@15:     char left, right;
terom@15:     
terom@15:     // flags from lex_transition_flags
terom@15:     char flags;
terom@15:     
terom@15:     // next state to enter
terom@15:     int next_state;
terom@15: };
terom@15: 
terom@15: /*
terom@15:  * State flags
terom@15:  */ 
terom@15: enum lex_state_flags {
terom@15:     LEX_STATE_END       = 0x01,
terom@15: };
terom@15: 
terom@15: /*
terom@15:  * A state
terom@15:  */
terom@15: struct lex_state {
terom@15:     // the state name (for debugging)
terom@15:     const char *name;
terom@15: 
terom@15:     // flags from lex_state_flags
terom@15:     char flags;
terom@15: 
terom@15:     // list of transitions for this state, terminated by a transition with next_state=0
terom@15:     struct lex_transition trans_list[15];
terom@15: };
terom@15: 
terom@15: /*
terom@16:  * Special states, these are all defined as zero
terom@15:  */
terom@15: 
terom@15: // shows up in token_fn as the value of next_token when this_token is the last token.
terom@15: #define LEX_EOF 0
terom@15: 
terom@16: // shows up as the initial value of prev_token
terom@16: #define LEX_INITIAL 0
terom@16: 
terom@15: /*
terom@15:  * Lex machine
terom@15:  */
terom@15: struct lex {
terom@15:     /*
terom@15:      * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
terom@15:      * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
terom@15:      * is the state that terminated this token, and `prev_token` was the token before this one.
terom@15:      *
terom@15:      * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
terom@15:      * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
terom@15:      *
terom@15:      * Return zero to have lexing continue, nonzero to stop lexing.
terom@15:      */
terom@15:     int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
terom@15: 
terom@15:     /*
terom@16:      * Called on every char handled by the lexer.
terom@16:      *
terom@16:      * The NUL byte at the end of the input string is not passed to char_fn (why not?).
terom@15:      *
terom@15:      * Return zero to have lexing continue, nonzero to stop lexing.
terom@15:      */
terom@16:     int (*char_fn) (char token_char, int from_token, int to_token, void *arg);
terom@15: 
terom@15:     /*
terom@15:      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
terom@15:      *
terom@15:      * Return zero to indiciate that the input was valid, nonzero to indicate an error.
terom@15:      */
terom@15:     int (*end_fn) (int last_token, void *arg);
terom@15:     
terom@15:     // number of states
terom@15:     size_t state_count;
terom@15: 
terom@16:     // initial state
terom@16:     int initial_state;
terom@16: 
terom@15:     // array of lex_states, indexable by the state id.
terom@15:     struct lex_state state_list[];
terom@15: };
terom@15: 
terom@15: /*
terom@15:  * Helper macros for building the state_list
terom@15:  */
terom@15: #define LEX_STATE(enum_val)     { #enum_val, 0,
terom@15: #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
terom@15: 
terom@15:     #define LEX_CHAR(c, to)         { c, c, 0, to }
terom@15:     #define LEX_RANGE(l, r, to)     { l, r, 0, to }
terom@15:     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
terom@15:     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
terom@15:     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
terom@15:     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
terom@15:     #define LEX_INVALID(c)          { c, c, LEX_TRANS_INVALID, 0 }
terom@15: 
terom@15:     #define LEX_DEFAULT(to)         { 0, 0, LEX_TRANS_DEFAULT, to } \
terom@15:                                   }
terom@15:     #define LEX_END                 { 0, 0, 0, 0 } \
terom@15:                                   }
terom@15: 
terom@15: /*
terom@16:  * Helpers for handling states
terom@16:  */ 
terom@16: #define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...")
terom@16: 
terom@16: /*
terom@15:  * Lex it!
terom@15:  *
terom@15:  * Return zero to indiciate that the input was valid, nonzero otherwise.
terom@15:  */
terom@15: int lexer (const struct lex *lex, const char *input, void *arg);
terom@15: 
terom@15: #endif /* LIB_LEXER_H */