src/lib/lexer.h
author Tero Marttila <terom@fixme.fi>
Tue, 07 Oct 2008 18:38:03 +0300
changeset 13 385b9a10d096
child 14 115067dfba55
permissions -rw-r--r--
inital playing around with a lexer/url parser
13
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     1
#ifndef LIB_LEXER_H
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     2
#define LIB_LEXER_H

#include <stddef.h>
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     3
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     4
/*
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     5
 * Simple FSM lexing
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     6
 *
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     7
 * The lexer is implemented as a Finite State Machine, consisting of a number of states, each of which contains a set of
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     8
 * transitions, which move the lexer from state to state based on each char of input at a time.
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
     9
 *
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    10
 * Whenever the state changes, the token callback is triggered with the collected token data.
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    11
 */
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    12
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    13
/*
 * A transition from one state to another.
 *
 * A transition matches every input char in the inclusive range [left, right]; a transition for a single char
 * simply has left == right.
 */
struct lex_transition {
    // applies to chars [left, right] (both ends inclusive)
    char left, right;

    // next state to enter
    int next_state;
};
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    23
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    24
/*
 * A single state of the lexer.
 */
struct lex_state {
    // the state name (for debugging)
    const char *name;

    // list of transitions for this state, terminated by a transition with next_state=0
    // (consequently, state id 0 cannot be the target of a real transition)
    struct lex_transition *trans_list;
};
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    36
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    37
/*
 * Lex machine: the state table plus the callbacks that receive the lexer's output.
 */
struct lex {
    // number of states
    size_t state_count;

    // array of lex_states, indexable by the state id.
    struct lex_state *state_list;

    /*
     * Core token handler. Every time a full token is lexed (i.e. the state changes), this will be called.
     * `this_token` represents the full token that was parsed, and `token_data` is the token's value. `next_token`
     * is the state that terminated this token, and `prev_token` was the token before this one.
     *
     * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. It may therefore
     * be modified freely, but its contents will be replaced by the next token, so copy it if you need to keep
     * hold of it.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);

    /*
     * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
     *
     * Return zero to have lexing continue, nonzero to stop lexing.
     */
    int (*lex_char_fn) (int this_token, char token_char, void *arg);

    /*
     * Called when the end of input has been reached, `last_token` is the state that we terminated in.
     *
     * Return zero to indicate that the input was valid, nonzero to indicate an error.
     */
    int (*lex_end_fn) (int last_token, void *arg);
};
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    73
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    74
/*
 * Helper macros for building the state_list.
 *
 * Usage:
 *      LEX_STATE(STATE_ID) <transition macros...> LEX_STATE_END
 *
 * LEX_STATE stringifies the enum name into lex_state.name and opens an array of transitions (a compound literal,
 * since lex_state.trans_list is a pointer and cannot take a brace-enclosed list of structs directly).
 *
 * Every transition macro expands to one or more `{ left, right, next_state },` entries and carries its own
 * trailing comma, so transition macros are written back to back WITHOUT separating commas. LEX_STATE_END adds the
 * all-zero terminator and closes the state; it has no trailing comma, so separate consecutive states with one.
 */
#define LEX_STATE(enum_val)     { #enum_val, (struct lex_transition[]) {

    #define LEX_CHAR(c, to)         { (c), (c), (to) },
    #define LEX_RANGE(l, r, to)     { (l), (r), (to) },
    #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to) LEX_RANGE('A', 'Z', to)
    #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
    #define LEX_ALNUM(to)           LEX_ALPHA(to) LEX_NUMBER(to) LEX_CHAR('-', to) LEX_CHAR('_', to)
    #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to) LEX_CHAR('\n', to) LEX_CHAR('\t', to)

#define LEX_STATE_END               {0, 0, 0} \
                                } }
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    88
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    89
/*
 * Lex it!
 *
 * Lexes `input` using the state machine and callbacks given in `lex`.
 * NOTE(review): `arg` is presumably passed through unchanged as the `arg` parameter of the lex_*_fn callbacks --
 * confirm against the implementation.
 *
 * Return zero to indicate that the input was valid, nonzero otherwise.
 */
int lexer (struct lex *lex, const char *input, void *arg);
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    95
385b9a10d096 inital playing around with a lexer/url parser
Tero Marttila <terom@fixme.fi>
parents:
diff changeset
    96
#endif /* LIB_LEXER_H */