src/lib/lexer.h
changeset 14 115067dfba55
parent 13 385b9a10d096
equal deleted inserted replaced
13:385b9a10d096 14:115067dfba55
     9  *
     9  *
    10  * Whenever the state changes, the token callback is triggered with the collected token data.
    10  * Whenever the state changes, the token callback is triggered with the collected token data.
    11  */
    11  */
    12 
    12 
    13 /*
    13 /*
       
    14  * Transition flags
       
    15  */
       
    16 enum lex_transition_flags {
       
    17     LEX_TRANS_DEFAULT   = 0x01,
       
    18     LEX_TRANS_FINAL     = 0x02,
       
    19 };
       
    20 
       
    21 /*
    14  * A transition from one state to another.
    22  * A transition from one state to another.
    15  */
    23  */
    16 struct lex_transition {
    24 struct lex_transition {
    17     // applies to chars [left, right]
    25     // applies to chars [left, right]
    18     char left, right;
    26     char left, right;
    19     
    27     
       
    28     // flags from lex_transition_flags
       
    29     char flags;
       
    30     
    20     // next state to enter
    31     // next state to enter
    21     int next_state;
    32     int next_state;
       
    33 };
       
    34 
       
    35 /*
       
    36  * State flags
       
    37  */ 
       
    38 enum lex_state_flags {
       
    39     LEX_STATE_END       = 0x01;
    22 };
    40 };
    23 
    41 
    24 /*
    42 /*
    25  * A state
    43  * A state
    26  */
    44  */
    27 struct lex_state {
    45 struct lex_state {
    28     // the state name (for debugging)
    46     // the state name (for debugging)
    29     const char *name;
    47     const char *name;
    30 
    48 
       
    49     // flags from lex_state_flags
       
    50     char flags;
       
    51 
    31     // list of transitions for this state, terminated by a transition with next_state=0
    52     // list of transitions for this state, terminated by a transition with next_state=0
    32     struct lex_transition *trans_list;
    53     struct lex_transition *trans_list;
    33 
       
    34 
       
    35 };
    54 };
    36 
    55 
    37 /*
    56 /*
    38  * Lex machine
    57  * Lex machine
    39  */
    58  */
    52      * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Therefore, it can be
    71      * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Therefore, it can be
    53      * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
    72      * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
    54      *
    73      *
    55      * Return zero to have lexing continue, nonzero to stop lexing.
    74      * Return zero to have lexing continue, nonzero to stop lexing.
    56      */
    75      */
    57     int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
    76     int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
    58 
    77 
    59     /*
    78     /*
    60      * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
    79      * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
    61      *
    80      *
    62      * Return zero to have lexing continue, nonzero to stop lexing.
    81      * Return zero to have lexing continue, nonzero to stop lexing.
    63      */
    82      */
    64     int (*lex_char_fn) (int this_token, char token_char, void *arg);
    83     int (*char_fn) (int this_token, char token_char, void *arg);
    65 
    84 
    66     /*
    85     /*
    67      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
    86      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
    68      *
    87      *
    69      * Return zero to indicate that the input was valid, nonzero to indicate an error.
    88      * Return zero to indicate that the input was valid, nonzero to indicate an error.
    70      */
    89      */
    71     int (*lex_end_fn) (int last_token, void *arg);
    90     int (*end_fn) (int last_token, void *arg);
    72 };
    91 };
    73 
    92 
    74 /*
    93 /*
    75  * Helper macros for building the state_list
    94  * Helper macros for building the state_list
    76  */
    95  */
    77 #define LEX_STATE(enum_val)     { #enum_val, {
    96 #define LEX_STATE(enum_val)     { #enum_val, 0,
       
    97 #define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
    78 
    98 
    79     #define LEX_CHAR(c, to)         { c, c, to },
    99     #define LEX_CHAR(c, to)         { c, c, 0, to },
    80     #define LEX_RANGE(l, r, to)     { l, r, to },
   100     #define LEX_RANGE(l, r, to)     { l, r, 0, to },
    81     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
   101     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
    82     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
   102     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
    83     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
   103     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
    84     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
   104     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
    85 
   105 
    86 #define LEX_STATE_END               {0, 0, 0} \
   106     #define LEX_DEFAULT(to)         { 0, 0, LEX_TRANS_DEFAULT, to } \
    87                                 } }
   107                                   }
       
   108     #define LEX_END                 { 0, 0, 0, 0 } \
       
   109                                   }
    88 
   110 
    89 /*
   111 /*
    90  * Lex it!
   112  * Lex it!
    91  *
   113  *
    92  * Return zero to indicate that the input was valid, nonzero otherwise.
   114  * Return zero to indicate that the input was valid, nonzero otherwise.
    93  */
   115  */
    94 int lexer (struct lex *lex, const char *input, void *arg);
   116 int lexer (const struct lex *lex, const char *input, void *arg);
    95 
   117 
    96 #endif /* LIB_LEXER_H */
   118 #endif /* LIB_LEXER_H */