src/lib/lex.h
changeset 56 9dfc861273e5
parent 42 40a3b13ffc9d
child 57 527d23bf6441
equal deleted inserted replaced
42:40a3b13ffc9d 56:9dfc861273e5
     1 #ifndef LIB_LEXER_H
       
     2 #define LIB_LEXER_H
       
     3 
       
     4 /*
       
     5  * Simple FSM lexing
       
     6  *
       
     7  * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of
       
     8  * transitions, which move the lexer from state to state based on each char of input at a time.
       
     9  *
       
    10  * Whenever the state changes, the token callback is triggered with the collected token data.
       
    11  */
       
    12 
       
    13 #include <sys/types.h>
       
    14 
       
    15 /*
       
    16  * Transition flags
       
    17  */
       
    18 enum lex_transition_flags {
       
    19     LEX_TRANS_DEFAULT   = 0x01,
       
    20     /* not supported
       
    21     LEX_TRANS_FINAL     = 0x02, */
       
    22     LEX_TRANS_INVALID   = 0x04,
       
    23 };
       
    24 
       
    25 /*
       
    26  * A transition from one state to another.
       
    27  */
       
    28 struct lex_transition {
       
    29     // applies to chars [left, right]
       
    30     char left, right;
       
    31     
       
    32     // flags from lex_transition_flags
       
    33     char flags;
       
    34     
       
    35     // next state to enter
       
    36     int next_state;
       
    37 };
       
    38 
       
    39 /*
       
    40  * State flags
       
    41  */ 
       
    42 enum lex_state_flags {
       
    43     LEX_STATE_END       = 0x01,
       
    44 };
       
    45 
       
    46 /*
       
    47  * A state
       
    48  */
       
    49 struct lex_state {
       
    50     // the state name (for debugging)
       
    51     const char *name;
       
    52 
       
    53     // flags from lex_state_flags
       
    54     char flags;
       
    55 
       
    56     // list of transitions for this state, terminated by a transition with next_state=0
       
    57     struct lex_transition trans_list[15];
       
    58 };
       
    59 
       
    60 /*
       
    61  * Special states, these are all defined as zero
       
    62  */
       
    63 
       
    64 // shows up in token_fn as the value of next_token when this_token is the last token.
       
    65 #define LEX_EOF 0
       
    66 
       
    67 // shows up as the initial value of prev_token
       
    68 #define LEX_INITIAL 0
       
    69 
       
    70 /*
       
    71  * Lex machine
       
    72  */
       
    73 struct lex {
       
    74     /*
       
    75      * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
       
    76      * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
       
    77      * is the state that terminated this token, and `prev_token` was the token before this one.
       
    78      *
       
    79      * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
       
    80      * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
       
    81      *
       
    82      * Return zero to have lexing continue, nonzero to stop lexing.
       
    83      */
       
    84     int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
       
    85 
       
    86     /*
       
    87      * Called on every char handled by the lexer.
       
    88      *
       
    89      * The NUL byte at the end of the input string is not passed to char_fn (why not?).
       
    90      *
       
    91      * Return zero to have lexing continue, nonzero to stop lexing.
       
    92      */
       
    93     int (*char_fn) (char token_char, int from_token, int to_token, void *arg);
       
    94 
       
    95     /*
       
    96      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
       
    97      *
       
    98      * Return zero to indiciate that the input was valid, nonzero to indicate an error.
       
    99      */
       
   100     int (*end_fn) (int last_token, void *arg);
       
   101     
       
   102     // number of states
       
   103     size_t state_count;
       
   104 
       
   105     // initial state
       
   106     int initial_state;
       
   107 
       
   108     // array of lex_states, indexable by the state id.
       
   109     struct lex_state state_list[];
       
   110 };
       
   111 
       
   112 /*
       
   113  * Helper macros for building the state_list
       
   114  */
       
   115 #define LEX_STATE(enum_val)     { #enum_val, 0,
       
   116 #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
       
   117 
       
   118     #define LEX_CHAR(c, to)         { c, c, 0, to }
       
   119     #define LEX_RANGE(l, r, to)     { l, r, 0, to }
       
   120     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
       
   121     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
       
   122     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
       
   123     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
       
   124     #define LEX_INVALID(c)          { c, c, LEX_TRANS_INVALID, 0 }
       
   125 
       
   126     #define LEX_DEFAULT(to)         { 0, 0, LEX_TRANS_DEFAULT, to } \
       
   127                                   }
       
   128     #define LEX_END                 { 0, 0, 0, 0 } \
       
   129                                   }
       
   130 
       
   131 /*
       
   132  * Helpers for handling states
       
   133  */ 
       
   134 #define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...")
       
   135 
       
   136 /*
       
   137  * Lex it!
       
   138  *
       
   139  * Return zero to indiciate that the input was valid, nonzero otherwise.
       
   140  */
       
   141 int lexer (const struct lex *lex, const char *input, void *arg);
       
   142 
       
   143 #endif /* LIB_LEXER_H */