# HG changeset patch # User Tero Marttila # Date 1223400695 -10800 # Node ID 115067dfba554e6d8bcab914c308a9dfd08ee74f # Parent 385b9a10d096561ffbbc4e3260725bda9c3c05c7 more intermediate work diff -r 385b9a10d096 -r 115067dfba55 src/lib/lexer.h --- a/src/lib/lexer.h Tue Oct 07 18:38:03 2008 +0300 +++ b/src/lib/lexer.h Tue Oct 07 20:31:35 2008 +0300 @@ -11,27 +11,46 @@ */ /* + * Transition flags + */ +enum lex_transition_flags { + LEX_TRANS_DEFAULT = 0x01, + LEX_TRANS_FINAL = 0x02, +}; + +/* * A transition from one state to another. */ struct lex_transition { // applies to chars [left, right] char left, right; + // flags from lex_transition_flags + char flags; + // next state to enter int next_state; }; /* + * State flags + */ +enum lex_state_flags { + LEX_STATE_END = 0x01; +}; + +/* * A state */ struct lex_state { // the state name (for debugging) const char *name; + // flags from lex_state_flags + char flags; + // list of transitions for this state, terminated by a transition with next_state=0 struct lex_transition *trans_list; - - }; /* @@ -54,43 +73,46 @@ * * Return zero to have lexing continue, nonzero to stop lexing. */ - int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); + int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg); /* * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to. * * Return zero to have lexing continue, nonzero to stop lexing. */ - int (*lex_char_fn) (int this_token, char token_char, void *arg); + int (*char_fn) (int this_token, char token_char, void *arg); /* * Called when the end of input has been reached, `last_token` is the state that we terminated in. * * Return zero to indiciate that the input was valid, nonzero to indicate an error. 
*/ - int (*lex_end_fn) (int last_token, void *arg); + int (*end_fn) (int last_token, void *arg); }; /* * Helper macros for building the state_list */ -#define LEX_STATE(enum_val) { #enum_val, { +#define LEX_STATE(enum_val) { #enum_val, 0, +#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END, - #define LEX_CHAR(c, to) { c, c, to }, - #define LEX_RANGE(l, r, to) { l, r, to }, + #define LEX_CHAR(c, to) { c, c, 0, to }, + #define LEX_RANGE(l, r, to) { l, r, 0, to }, #define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to) #define LEX_NUMBER(to) LEX_RANGE('0', '9', to) #define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to) #define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to) -#define LEX_STATE_END {0, 0, 0} \ - } } + #define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \ + } + #define LEX_END { 0, 0, 0, 0 } \ + } /* * Lex it! * * Return zero to indiciate that the input was valid, nonzero otherwise. */ -int lexer (struct lex *lex, const char *input, void *arg); +int lexer (const struct lex *lex, const char *input, void *arg); #endif /* LIB_LEXER_H */ diff -r 385b9a10d096 -r 115067dfba55 src/lib/url.c --- a/src/lib/url.c Tue Oct 07 18:38:03 2008 +0300 +++ b/src/lib/url.c Tue Oct 07 20:31:35 2008 +0300 @@ -2,14 +2,25 @@ #include "url.h" #include "lexer.h" -enum url_tokens { +enum url_token { URL_INVALID, + URL_BEGIN, + + // kludge to resolve ambiguous URL_SCHEME/URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE at the beginning + URL_BEGIN_ALNUM, + URL_BEGIN_COLON, + URL_SCHEME, URL_SCHEME_SEP, URL_SCHEME_END_COL, URL_SCHEME_END_SLASH1, URL_SCHEME_END_SLASH2, + + // kludge to resolve ambiguous URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE after a scheme + URL_USERHOST_ALNUM, + URL_USERHOST_COLON, + URL_USERHOST_ALNUM2, URL_USERNAME, URL_PASSWORD_SEP, @@ -29,20 +40,231 @@ URL_OPT_EQ, URL_OPT_VAL, URL_OPT_SEP, + + URL_END, URL_MAX, }; -static struct lex *url_lex = { - 
.state_count = URL_MAX, - .stae_list = { - LEX_STATE(URL_SCHEME) - LEX_ALNUM ( URL_SCHEME ), - LEX_CHAR ( '+', URL_SCHEME_SEP ), - LEX_STATE_END, +/* + * Parser state + */ +struct url_state { + struct url *url; +}; - }, +static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) { + enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token; + struct url_state *state = arg; + } +static int url_lex_end (int _last_token, void *arg) { + enum url_token last_token = _last_token; + struct url_state *state = arg; + +} + +static struct lex url_lex = { + .state_count = URL_MAX, + .state_list = { + LEX_STATE ( URL_BEGIN ) { + LEX_ALNUM ( URL_BEGIN_ALNUM ), + LEX_CHAR ( ':', URL_SERVICE_SEP ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME + LEX_STATE_END ( URL_BEGIN_ALNUM ) { + LEX_ALNUM ( URL_BEGIN_ALNUM ), + LEX_CHAR ( '+', URL_SCHEME_SEP ), // it was URL_SCHEME + LEX_CHAR ( ':', URL_BEGIN_COLON ), + LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME + LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME + LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME + LEX_END + }, + + // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP + LEX_STATE ( URL_BEGIN_COLON ) { + LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), // it was URL_SCHEME + LEX_ALNUM ( URL_USERHOST_ALNUM2 ), + LEX_END + }, + + + LEX_STATE ( URL_SCHEME ) { + LEX_ALNUM ( URL_SCHEME ), + LEX_CHAR ( '+', URL_SCHEME_SEP ), + LEX_CHAR ( ':', URL_SCHEME_END_COL ), + LEX_END + }, + + LEX_STATE ( URL_SCHEME_SEP ) { + LEX_ALNUM ( URL_SCHEME ), + LEX_END + }, + + LEX_STATE ( URL_SCHEME_END_COL ) { + LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), + LEX_END + }, + + LEX_STATE ( URL_SCHEME_END_SLASH1 ) { + LEX_CHAR ( '/', URL_SCHEME_END_SLASH2 ), + LEX_END + }, + + LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) { + LEX_ALNUM ( 
URL_USERHOST_ALNUM ), + LEX_CHAR ( ':', URL_SERVICE_SEP ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + // this can be URL_USERNAME or URL_HOSTNAME + LEX_STATE_END ( URL_USERHOST_ALNUM ) { + LEX_ALNUM ( URL_USERHOST_ALNUM ), + LEX_CHAR ( ':', URL_USERHOST_COLON ), + LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME + LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME + LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME + LEX_END + } + + // this can be URL_USERNAME_END or URL_SERVICE_SEP + LEX_STATE ( URL_USERHOST_COLON ) { + LEX_ALNUM ( URL_USERHOST_ALNUM2 ), + LEX_END + }, + + // this can be URL_PASSWORD or URL_SERVICE + LEX_STATE_END ( URL_USERHOST_ALNUM2 ) { + LEX_ALNUM ( URL_USERHOST_ALNUM ), + LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_PASSWORD + LEX_CHAR ( '/', URL_PATH_START ), // it was URL_SERVICE + LEX_CHAR ( '?', URL_OPT_START ), // it was URL_SERVICE + LEX_END + }, + + // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2 + LEX_STATE ( URL_USERNAME ) { + LEX_END + }, + + LEX_STATE ( URL_PASSWORD_SEP ) { + LEX_END + }, + + LEX_STATE ( URL_PASSWORD ) { + LEX_END + }, + + + LEX_STATE_END ( URL_USERNAME_END ) { + LEX_ALNUM ( URL_HOSTNAME ), + LEX_CHAR ( ':', URL_SERVICE_SEP ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + + LEX_STATE_END ( URL_HOSTNAME ) { + LEX_ALNUM ( URL_HOSTNAME ), + LEX_CHAR ( ':', URL_SERVICE_SEP ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + + LEX_STATE ( URL_SERVICE_SEP ) { + LEX_ALNUM ( URL_SERVICE ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + LEX_STATE_END ( URL_SERVICE ) { + LEX_ALNUM ( URL_SERVICE ), + LEX_CHAR ( '/', URL_PATH_START ), + LEX_CHAR ( '?', URL_OPT_START ), + LEX_END + }, + + + LEX_STATE_END ( URL_PATH_START ) { + LEX_CHAR ( '?', URL_OPT_START ), + LEX_DEFAULT ( URL_PATH ), + }, 
+ + LEX_STATE_END ( URL_PATH ) { + LEX_CHAR ( '?', URL_OPT_START ), + LEX_DEFAULT ( URL_PATH ), + }, + + + LEX_STATE_END ( URL_OPT_START ) { + LEX_CHAR ( '&', URL_OPT_SEP ), + LEX_CHAR ( '=', URL_ERROR ), + LEX_DEFAULT ( URL_OPT_KEY ), + }, + + LEX_STATE_END ( URL_OPT_KEY ) { + LEX_CHAR ( '&', URL_OPT_SEP ), + LEX_CHAR ( '=', URL_OPT_EQ ), + LEX_DEFAULT ( URL_OPT_KEY ), + }, + + LEX_STATE_END ( URL_OPT_EQ ) { + LEX_CHAR ( '&', URL_OPT_SEP ), + LEX_DEFAULT ( URL_OPT_VAL ), + }, + + LEX_STATE_END ( URL_OPT_VAL ) { + LEX_CHAR ( '&', URL_OPT_SEP ), + LEX_DEFAULT ( URL_OPT_VAL ), + }, + + LEX_STATE_END ( URL_OPT_SEP ) { + LEX_CHAR ( '&', URL_OPT_SEP ), + LEX_CHAR ( '=', URL_ERROR ), + LEX_DEFAULT ( URL_OPT_KEY ), + }, + + LEX_STATE ( URL_ERROR ) { + LEX_END + }, + + URL_MAX, + }, + + .token_fn = url_lex_token, + .char_fn = NULL, + .end_fn = url_lex_end, +}; + +int url_parse (struct url *url, const char *text) { + struct url_state state; ZINIT(state); + int ret; + + // set up state + state.url = url; + + // parse it + if ((ret = lexer(&url_lex, text, &state))) + ERROR("invalid URL"); + + // success + return 0; + +error: + return -1; +} + diff -r 385b9a10d096 -r 115067dfba55 src/lib/url.h --- a/src/lib/url.h Tue Oct 07 18:38:03 2008 +0300 +++ b/src/lib/url.h Tue Oct 07 20:31:35 2008 +0300 @@ -4,7 +4,7 @@ /* * A trivial parser for simple URLs * - * [ [ "+" [ ... ] ] "://" ] [ [ ":" ] "@" ] [ ":" ] [ "/" ] [ "?" [ [ "=" ] ] [ "&" [ [ "=" ] ] [ ... ] ] + * [ [ "+" [ ... ] ] "://" ] [ [ ":" ] "@" ] [  ] [ ":" ] [ "/" ] [ "?" [ [ "=" ] ] [ "&" [ [ "=" ] ] [ ... ] ] * * example.com * tcp://example.com:7348/ @@ -47,6 +47,8 @@ /* * Parse the given `text` as an URL, returning the result in `url`. Optional fields that are missing in the text will * cause those values to be returned unmodified. + * + * Returns zero if the url was valid and was parsed, nonzero if it was invalid. */ int url_parse (struct url *url, const char *text);