# HG changeset patch
# User Tero Marttila
# Date 1223393883 -10800
# Node ID 385b9a10d096561ffbbc4e3260725bda9c3c05c7
# Parent 7f159ee3a3ffae736980b0ef2d37ddcd4e0fc74c
initial playing around with a lexer/url parser

diff -r 7f159ee3a3ff -r 385b9a10d096 src/lib/lexer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/lexer.h	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,96 @@
+#ifndef LIB_LEXER_H
+#define LIB_LEXER_H
+
+#include <stddef.h>     // size_t
+
+/*
+ * Simple FSM lexing
+ *
+ * The lexer is implemented as a Finite State Machine, consisting of a number of states, each of which contains a
+ * set of transitions that move the lexer from state to state, one char of input at a time.
+ *
+ * Whenever the state changes, the token callback is triggered with the collected token data.
+ */
+
+/*
+ * A transition from one state to another.
+ */
+struct lex_transition {
+    // applies to chars [left, right]
+    char left, right;
+
+    // next state to enter
+    int next_state;
+};
+
+/*
+ * A state
+ */
+struct lex_state {
+    // the state name (for debugging)
+    const char *name;
+
+    // list of transitions for this state, terminated by a transition with next_state=0
+    struct lex_transition *trans_list;
+};
+
+/*
+ * Lex machine
+ */
+struct lex {
+    // number of states
+    size_t state_count;
+
+    // array of lex_states, indexable by the state id
+    struct lex_state *state_list;
+
+    /*
+     * Core token handler. Every time a full token is lexed (i.e. the state changes), this will be called.
+     * `this_token` is the token that was just completed, and `token_data` is the token's value. `next_token`
+     * is the state that terminated this token, and `prev_token` was the token before this one.
+     *
+     * `token_data` is a buffer allocated by the lexer that the actual input data is copied into. It may be
+     * modified, but its contents will be replaced by the next token, so copy it if you need to keep hold of it.
+     *
+     * Return zero to have lexing continue, nonzero to stop lexing.
+     */
+    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
+
+    /*
+     * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
+     *
+     * Return zero to have lexing continue, nonzero to stop lexing.
+     */
+    int (*lex_char_fn) (int this_token, char token_char, void *arg);
+
+    /*
+     * Called when the end of input has been reached. `last_token` is the state that we terminated in.
+     *
+     * Return zero to indicate that the input was valid, nonzero to indicate an error.
+     */
+    int (*lex_end_fn) (int last_token, void *arg);
+};
+
+/*
+ * Helper macros for building the state_list.
+ *
+ * LEX_STATE opens a state entry (indexed by its state id) and its transition list, the LEX_* transition macros
+ * fill it in, and LEX_STATE_END terminates and closes it again.
+ */
+#define LEX_STATE(enum_val)     [enum_val] = { .name = #enum_val, .trans_list = (struct lex_transition []) {
+
+#define LEX_CHAR(c, to)         { c, c, to },
+#define LEX_RANGE(l, r, to)     { l, r, to },
+#define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
+#define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
+#define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
+#define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
+
+#define LEX_STATE_END           { 0, 0, 0 } } }
+
+/*
+ * Lex it!
+ *
+ * Return zero to indicate that the input was valid, nonzero otherwise.
+ */
+int lexer (struct lex *lex, const char *input, void *arg);
+
+#endif /* LIB_LEXER_H */
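[editor's note, not part of the patch: a minimal sketch of how the lexer.h API above might be driven. The T_NUM/T_SPACE states and the print_token callback are invented for illustration; lexer() is only declared in this changeset, and the header does not say how the start state is chosen, so this assumes state 1 is the initial state and that NULL callbacks are skipped.]

    #include <stdio.h>

    #include "lexer.h"

    // token states for a toy "numbers separated by whitespace" lexer
    enum { T_INVALID, T_NUM, T_SPACE, T_MAX };

    // called once per completed token; return nonzero to abort lexing
    static int print_token (int this_token, char *token_data, int next_token, int prev_token, void *arg) {
        (void) next_token; (void) prev_token; (void) arg;

        if (this_token == T_NUM)
            printf("number: %s\n", token_data);

        return 0;
    }

    static struct lex num_lex = {
        .state_count    = T_MAX,
        .state_list     = (struct lex_state []) {
            LEX_STATE(T_NUM)
                LEX_NUMBER      ( T_NUM   ),
                LEX_WHITESPACE  ( T_SPACE ),
                LEX_STATE_END,

            LEX_STATE(T_SPACE)
                LEX_WHITESPACE  ( T_SPACE ),
                LEX_NUMBER      ( T_NUM   ),
                LEX_STATE_END,
        },

        .lex_token_fn   = print_token,
    };

    int main (void) {
        // should report the tokens "123" and "456"
        return lexer(&num_lex, "123 456", NULL);
    }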
diff -r 7f159ee3a3ff -r 385b9a10d096 src/lib/url.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/url.c	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,48 @@
+
+#include "url.h"
+#include "lexer.h"
+
+enum url_tokens {
+    URL_INVALID,
+
+    URL_SCHEME,
+    URL_SCHEME_SEP,
+    URL_SCHEME_END_COL,
+    URL_SCHEME_END_SLASH1,
+    URL_SCHEME_END_SLASH2,
+
+    URL_USERNAME,
+    URL_PASSWORD_SEP,
+    URL_PASSWORD,
+    URL_USERNAME_END,
+
+    URL_HOSTNAME,
+
+    URL_SERVICE_SEP,
+    URL_SERVICE,
+
+    URL_PATH_START,
+    URL_PATH,
+
+    URL_OPT_START,
+    URL_OPT_KEY,
+    URL_OPT_EQ,
+    URL_OPT_VAL,
+    URL_OPT_SEP,
+
+    URL_MAX,
+};
+
+static struct lex url_lex = {
+    .state_count    = URL_MAX,
+    .state_list     = (struct lex_state []) {
+        LEX_STATE(URL_SCHEME)
+            LEX_ALNUM   (           URL_SCHEME      ),
+            LEX_CHAR    (   '+',    URL_SCHEME_SEP  ),
+            LEX_STATE_END,
+    },
+};

diff -r 7f159ee3a3ff -r 385b9a10d096 src/lib/url.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/url.h	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,53 @@
+#ifndef LIB_URL_H
+#define LIB_URL_H
+
+#include <stddef.h>     // size_t
+
+/*
+ * A trivial parser for simple URLs
+ *
+ *  [ <scheme> [ "+" <scheme> [ ... ] ] "://" ] [ <username> [ ":" <password> ] "@" ] <hostname> [ ":" <service> ] [ "/" <path> ] [ "?" <key> [ "=" <value> ] [ "&" <key> [ "=" <value> ] [ ... ] ] ]
+ *
+ *  example.com
+ *  tcp://example.com:7348/
+ *  psql://postgres@localhost/test_db?charset=utf8
+ */
+
+/*
+ * The scheme, as a list of "+"-separated components
+ */
+struct url_schema {
+    size_t count;
+    const char **list;
+};
+
+/*
+ * The options at the end
+ */
+struct url_opts {
+    size_t count;
+
+    struct url_opt {
+        const char *key;
+        const char *value;
+    } *list;
+};
+
+/*
+ * A parsed URL
+ */
+struct url {
+    struct url_schema *schema;
+    const char *username;
+    const char *password;
+    const char *hostname;
+    const char *service;
+    const char *path;
+    struct url_opts *opts;
+};
+
+/*
+ * Parse the given `text` as a URL, returning the result in `url`. Optional fields that are missing in the text will
+ * cause those values to be returned unmodified.
+ */
+int url_parse (struct url *url, const char *text);
+
+#endif /* LIB_URL_H */
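[editor's note, not part of the patch: a similarly hypothetical sketch of how the url.h interface is meant to be consumed. url_parse() is not implemented in this changeset, so the behaviour shown is inferred from the header comments; in particular, defaults are pre-filled because fields missing from the text are documented as being left unmodified.]

    #include <stdio.h>

    #include "url.h"

    int main (void) {
        // pre-fill defaults for the optional fields we care about
        struct url url = {
            .hostname   = "localhost",
            .service    = "5432",
            .path       = "",
        };

        if (url_parse(&url, "psql://postgres@localhost/test_db?charset=utf8"))
            return 1;

        printf("host=%s service=%s path=%s\n", url.hostname, url.service, url.path);

        return 0;
    }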