src/lib/lex.h
author Tero Marttila <terom@fixme.fi>
Thu, 09 Oct 2008 00:33:37 +0300
changeset 16 74fb62022fb3
parent 15 a8d183e79ed9
permissions -rw-r--r--
starting to work
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     1
#ifndef LIB_LEXER_H
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     2
#define LIB_LEXER_H
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     3
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     4
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     5
 * Simple FSM lexing
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     6
 *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     7
 * The lexer is implemented as a Finite State Machine, consisting for a number of states, which then contain a set of
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     8
 * transitions, which move the lexer from state to state based on each char of input at a time.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     9
 *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    10
 * Whenever the state changes, the token callback is triggered with the collected token data.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    11
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    12
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    13
#include <sys/types.h>
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    14
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    15
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    16
 * Transition flags
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    17
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    18
enum lex_transition_flags {
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    19
    LEX_TRANS_DEFAULT   = 0x01,
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    20
    /* not supported
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    21
    LEX_TRANS_FINAL     = 0x02, */
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    22
    LEX_TRANS_INVALID   = 0x04,
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    23
};
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    24
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    25
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    26
 * A transition from one state to another.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    27
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    28
struct lex_transition {
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    29
    // applies to chars [left, right]
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    30
    char left, right;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    31
    
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    32
    // flags from lex_transition_flags
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    33
    char flags;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    34
    
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    35
    // next state to enter
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    36
    int next_state;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    37
};
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    38
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    39
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    40
 * State flags
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    41
 */ 
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    42
enum lex_state_flags {
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    43
    LEX_STATE_END       = 0x01,
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    44
};
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    45
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    46
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    47
 * A state
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    48
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    49
struct lex_state {
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    50
    // the state name (for debugging)
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    51
    const char *name;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    52
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    53
    // flags from lex_state_flags
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    54
    char flags;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    55
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    56
    // list of transitions for this state, terminated by a transition with next_state=0
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    57
    struct lex_transition trans_list[15];
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    58
};
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    59
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    60
/*
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    61
 * Special states, these are all defined as zero
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    62
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    63
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    64
// shows up in token_fn as the value of next_token when this_token is the last token.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    65
#define LEX_EOF 0
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    66
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    67
// shows up as the initial value of prev_token
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    68
#define LEX_INITIAL 0
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    69
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    70
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    71
 * Lex machine
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    72
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    73
struct lex {
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    74
    /*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    75
     * Core token handler. Everytime a full token is lexed (i.e. the state changes), this will be called.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    76
     * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    77
     * is the state that terminated this token, and `prev_token` was the token before this one.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    78
     *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    79
     * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. Thence, it can be
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    80
     * modified, as its contents will be replaced by the next token. Hence, if you need to keep hold of it, copy it.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    81
     *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    82
     * Return zero to have lexing continue, nonzero to stop lexing.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    83
     */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    84
    int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    85
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    86
    /*
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    87
     * Called on every char handled by the lexer.
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    88
     *
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    89
     * The NUL byte at the end of the input string is not passed to char_fn (why not?).
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    90
     *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    91
     * Return zero to have lexing continue, nonzero to stop lexing.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    92
     */
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
    93
    int (*char_fn) (char token_char, int from_token, int to_token, void *arg);
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    94
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    95
    /*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    96
     * Called when the end of input has been reached, `last_token` is the state that we terminated in.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    97
     *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    98
     * Return zero to indiciate that the input was valid, nonzero to indicate an error.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    99
     */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   100
    int (*end_fn) (int last_token, void *arg);
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   101
    
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   102
    // number of states
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   103
    size_t state_count;
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   104
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   105
    // initial state
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   106
    int initial_state;
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   107
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   108
    // array of lex_states, indexable by the state id.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   109
    struct lex_state state_list[];
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   110
};
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   111
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   112
/*
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   113
 * Helper macros for building the state_list
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   114
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   115
#define LEX_STATE(enum_val)     { #enum_val, 0,
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   116
#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   117
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   118
    #define LEX_CHAR(c, to)         { c, c, 0, to }
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   119
    #define LEX_RANGE(l, r, to)     { l, r, 0, to }
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   120
    #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   121
    #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   122
    #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   123
    #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   124
    #define LEX_INVALID(c)          { c, c, LEX_TRANS_INVALID, 0 }
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   125
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   126
    #define LEX_DEFAULT(to)         { 0, 0, LEX_TRANS_DEFAULT, to } \
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   127
                                  }
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   128
    #define LEX_END                 { 0, 0, 0, 0 } \
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   129
                                  }
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   130
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   131
/*
16
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   132
 * Helpers for handling states
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   133
 */ 
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   134
#define LEX_STATE_NAME(lex, state) ((state) ? (lex)->state_list[(state) - 1].name : "...")
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   135
74fb62022fb3 starting to work
Tero Marttila <terom@fixme.fi>
parents: 15
diff changeset
   136
/*
15
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   137
 * Lex it!
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   138
 *
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   139
 * Return zero to indiciate that the input was valid, nonzero otherwise.
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   140
 */
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   141
int lexer (const struct lex *lex, const char *input, void *arg);
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   142
a8d183e79ed9 look ma, it compiles\!
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
   143
#endif /* LIB_LEXER_H */