src/lib/lexer.h
changeset 13 385b9a10d096
child 14 115067dfba55
equal deleted inserted replaced
12:7f159ee3a3ff 13:385b9a10d096
       
#ifndef LIB_LEXER_H
#define LIB_LEXER_H

#include <stddef.h>
       
     3 
       
     4 /*
       
     5  * Simple FSM lexing
       
     6  *
       
     7  * The lexer is implemented as a Finite State Machine, consisting of a number of states, which then contain a set of
       
     8  * transitions, which move the lexer from state to state based on each char of input at a time.
       
     9  *
       
    10  * Whenever the state changes, the token callback is triggered with the collected token data.
       
    11  */
       
    12 
       
    13 /*
       
    14  * A transition from one state to another.
       
    15  */
       
    16 struct lex_transition {
       
    17     // applies to chars [left, right]
       
    18     char left, right;
       
    19     
       
    20     // next state to enter
       
    21     int next_state;
       
    22 };
       
    23 
       
    24 /*
       
    25  * A state
       
    26  */
       
    27 struct lex_state {
       
    28     // the state name (for debugging)
       
    29     const char *name;
       
    30 
       
    31     // list of transitions for this state, terminated by a transition with next_state=0
       
    32     struct lex_transition *trans_list;
       
    33 
       
    34 
       
    35 };
       
    36 
       
    37 /*
       
    38  * Lex machine
       
    39  */
       
    40 struct lex {
       
    41     // number of states
       
    42     size_t state_count;
       
    43 
       
    44     // array of lex_states, indexable by the state id.
       
    45     struct lex_state *state_list;
       
    46 
       
    47     /*
       
    48      * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
       
    49      * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
       
    50      * is the state that terminated this token, and `prev_token` was the token before this one.
       
    51      *
       
    52      * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
       
    53      * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
       
    54      *
       
    55      * Return zero to have lexing continue, nonzero to stop lexing.
       
    56      */
       
    57     int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
       
    58 
       
    59     /*
       
    60      * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
       
    61      *
       
    62      * Return zero to have lexing continue, nonzero to stop lexing.
       
    63      */
       
    64     int (*lex_char_fn) (int this_token, char token_char, void *arg);
       
    65 
       
    66     /*
       
    67      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
       
    68      *
       
    69      * Return zero to indiciate that the input was valid, nonzero to indicate an error.
       
    70      */
       
    71     int (*lex_end_fn) (int last_token, void *arg);
       
    72 };
       
    73 
       
    74 /*
       
    75  * Helper macros for building the state_list
       
    76  */
       
    77 #define LEX_STATE(enum_val)     { #enum_val, {
       
    78 
       
    79     #define LEX_CHAR(c, to)         { c, c, to },
       
    80     #define LEX_RANGE(l, r, to)     { l, r, to },
       
    81     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
       
    82     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
       
    83     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
       
    84     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
       
    85 
       
    86 #define LEX_STATE_END               {0, 0, 0} \
       
    87                                 } }
       
    88 
       
    89 /*
       
    90  * Lex it!
       
    91  *
       
    92  * Return zero to indiciate that the input was valid, nonzero otherwise.
       
    93  */
       
    94 int lexer (struct lex *lex, const char *input, void *arg);
       
    95 
       
    96 #endif /* LIB_LEXER_H */