terom@15: #define _GNU_SOURCE terom@15: #include terom@15: #include terom@13: terom@13: #include "url.h" terom@15: #include "lex.h" terom@15: #include "error.h" terom@15: #include "misc.h" terom@13: terom@14: enum url_token { terom@13: URL_INVALID, terom@13: terom@14: URL_BEGIN, terom@14: terom@14: // kludge to resolve ambiguous URL_SCHEME/URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE at the beginning terom@14: URL_BEGIN_ALNUM, terom@14: URL_BEGIN_COLON, terom@14: terom@13: URL_SCHEME, terom@13: URL_SCHEME_SEP, terom@13: URL_SCHEME_END_COL, terom@13: URL_SCHEME_END_SLASH1, terom@13: URL_SCHEME_END_SLASH2, terom@14: terom@14: // kludge to resolve ambiguous URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE after a scheme terom@14: URL_USERHOST_ALNUM, terom@14: URL_USERHOST_COLON, terom@14: URL_USERHOST_ALNUM2, terom@13: terom@13: URL_USERNAME, terom@13: URL_PASSWORD_SEP, terom@13: URL_PASSWORD, terom@13: URL_USERNAME_END, terom@13: terom@13: URL_HOSTNAME, terom@13: terom@13: URL_SERVICE_SEP, terom@13: URL_SERVICE, terom@13: terom@13: URL_PATH_START, terom@13: URL_PATH, terom@13: terom@13: URL_OPT_START, terom@13: URL_OPT_KEY, terom@13: URL_OPT_EQ, terom@13: URL_OPT_VAL, terom@13: URL_OPT_SEP, terom@14: terom@13: URL_MAX, terom@13: }; terom@13: terom@14: /* terom@14: * Parser state terom@14: */ terom@14: struct url_state { terom@15: // the URL to parse into terom@14: struct url *url; terom@15: terom@15: // our lookahead-kludge terom@15: const char *alnum, *alnum2; terom@15: terom@15: }; terom@13: terom@15: static int _url_append_scheme (struct url *url, const char *data) { terom@15: terom@15: } terom@13: terom@15: static int _url_append_opt_key (struct url *url, const char *key) { terom@15: terom@15: } terom@15: terom@15: static int _url_append_opt_val (struct url *url, const char *value) { terom@15: terom@15: } terom@13: terom@14: static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) { terom@14: enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token; terom@14: struct url_state *state = arg; terom@15: const char **copy_to = NULL; terom@13: terom@15: (void) prev_token; terom@15: terom@15: switch (this_token) { terom@15: case URL_BEGIN_ALNUM: terom@15: switch (next_token) { terom@15: case URL_SCHEME_SEP: terom@15: // store the scheme terom@15: if (_url_append_scheme(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_USERNAME_END: terom@15: // store the username terom@15: copy_to = &state->url->username; break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@15: // store the hostname terom@15: copy_to = &state->url->hostname; break; terom@14: terom@15: case URL_BEGIN_COLON: terom@15: // gah... terom@15: copy_to = &state->alnum; break; terom@15: terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_BEGIN_COLON: terom@15: switch (next_token) { terom@15: case URL_SCHEME_END_SLASH1: terom@15: // store the schema terom@15: if (_url_append_scheme(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM2: terom@15: // gah.. terom@15: break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_SCHEME: terom@15: // store the scheme terom@15: if (_url_append_scheme(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_SCHEME_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_SCHEME_END_COL: terom@15: case URL_SCHEME_END_SLASH1: terom@15: case URL_SCHEME_END_SLASH2: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM: terom@15: switch (next_token) { terom@15: case URL_USERNAME_END: terom@15: // store the username terom@15: copy_to = &state->url->username; break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@15: // store the hostname terom@15: copy_to = &state->url->hostname; break; terom@15: terom@15: case URL_USERHOST_COLON: terom@15: // gah... terom@15: copy_to = &state->alnum; break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_USERHOST_COLON: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM2: terom@15: switch (next_token) { terom@15: case URL_USERNAME_END: terom@15: // store the username and password terom@15: state->url->username = state->alnum; state->alnum = NULL; terom@15: copy_to = &state->url->password; terom@15: terom@15: break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@15: // store the service terom@15: copy_to = &state->url->service; break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_USERNAME: terom@15: case URL_PASSWORD_SEP: terom@15: case URL_PASSWORD: terom@15: FATAL("these should be overshadowed"); terom@15: terom@15: case URL_USERNAME_END: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_HOSTNAME: terom@15: // store terom@15: copy_to = &state->url->hostname; break; terom@15: terom@15: case URL_SERVICE_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_SERVICE: terom@15: // store terom@15: copy_to = &state->url->service; break; terom@15: terom@15: case URL_PATH_START: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_PATH: terom@15: // store terom@15: copy_to = &state->url->path; break; terom@15: terom@15: case URL_OPT_START: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_OPT_KEY: terom@15: // store terom@15: if (_url_append_opt_key(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_OPT_EQ: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_OPT_VAL: terom@15: // store terom@15: if (_url_append_opt_val(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_OPT_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: default: terom@15: FATAL("invalid token"); terom@15: } terom@15: terom@15: if (copy_to) { terom@15: // copy the token data terom@15: if ((*copy_to = strdup(token_data)) == NULL) terom@15: ERROR("strdup"); terom@15: } terom@15: terom@15: // good terom@15: return 0; terom@15: terom@15: error: terom@15: // XXX: error codes? terom@15: return -1; terom@14: } terom@14: terom@14: static struct lex url_lex = { terom@15: .token_fn = url_lex_token, terom@15: .char_fn = NULL, terom@15: .end_fn = NULL, terom@15: terom@14: .state_count = URL_MAX, terom@14: .state_list = { terom@14: LEX_STATE ( URL_BEGIN ) { terom@14: LEX_ALNUM ( URL_BEGIN_ALNUM ), terom@14: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME terom@14: LEX_STATE_END ( URL_BEGIN_ALNUM ) { terom@14: LEX_ALNUM ( URL_BEGIN_ALNUM ), terom@14: LEX_CHAR ( '+', URL_SCHEME_SEP ), // it was URL_SCHEME terom@14: LEX_CHAR ( ':', URL_BEGIN_COLON ), terom@14: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME terom@14: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME terom@14: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME terom@14: LEX_END terom@14: }, terom@14: terom@14: // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP terom@14: LEX_STATE ( URL_BEGIN_COLON ) { terom@14: LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), // it was URL_SCHEME terom@14: LEX_ALNUM ( URL_USERHOST_ALNUM2 ), terom@14: LEX_END terom@14: }, terom@14: terom@14: terom@14: LEX_STATE ( URL_SCHEME ) { terom@14: LEX_ALNUM ( URL_SCHEME ), terom@14: LEX_CHAR ( '+', URL_SCHEME_SEP ), terom@14: LEX_CHAR ( ':', URL_SCHEME_END_COL ), terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE ( URL_SCHEME_SEP ) { terom@14: LEX_ALNUM ( URL_SCHEME ), terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE ( URL_SCHEME_END_COL ) { terom@14: LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE ( URL_SCHEME_END_SLASH1 ) { terom@14: LEX_CHAR ( '/', URL_SCHEME_END_SLASH2 ), terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) { terom@14: LEX_ALNUM ( URL_USERHOST_ALNUM ), terom@14: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: // this can be URL_USERNAME or URL_HOSTNAME terom@14: LEX_STATE_END ( URL_USERHOST_ALNUM ) { terom@14: LEX_ALNUM ( URL_USERHOST_ALNUM ), terom@14: LEX_CHAR ( ':', URL_USERHOST_COLON ), terom@14: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME terom@14: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME terom@14: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME terom@14: LEX_END terom@15: }, terom@14: terom@14: // this can be URL_USERNAME_END or URL_SERVICE_SEP terom@14: LEX_STATE ( URL_USERHOST_COLON ) { terom@14: LEX_ALNUM ( URL_USERHOST_ALNUM2 ), terom@14: LEX_END terom@14: }, terom@14: terom@14: // this can be URL_PASSWORD or URL_SERVICE terom@14: LEX_STATE_END ( URL_USERHOST_ALNUM2 ) { terom@14: LEX_ALNUM ( URL_USERHOST_ALNUM ), terom@14: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_PASSSWORD terom@14: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_SERVICE terom@14: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_SERVICE terom@14: LEX_END terom@14: }, terom@14: terom@14: // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2 terom@14: LEX_STATE ( URL_USERNAME ) { terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE ( URL_PASSWORD_SEP ) { terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE ( URL_PASSWORD ) { terom@14: LEX_END terom@14: }, terom@14: terom@14: terom@14: LEX_STATE_END ( URL_USERNAME_END ) { terom@14: LEX_ALNUM ( URL_HOSTNAME ), terom@14: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: terom@14: LEX_STATE_END ( URL_HOSTNAME ) { terom@14: LEX_ALNUM ( URL_HOSTNAME ), terom@14: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: terom@14: LEX_STATE ( URL_SERVICE_SEP ) { terom@14: LEX_ALNUM ( URL_SERVICE ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_SERVICE ) { terom@14: LEX_ALNUM ( URL_SERVICE ), terom@14: LEX_CHAR ( '/', URL_PATH_START ), terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_END terom@14: }, terom@14: terom@14: terom@14: LEX_STATE_END ( URL_PATH_START ) { terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_DEFAULT ( URL_PATH ), terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_PATH ) { terom@14: LEX_CHAR ( '?', URL_OPT_START ), terom@14: LEX_DEFAULT ( URL_PATH ), terom@14: }, terom@14: terom@14: terom@14: LEX_STATE_END ( URL_OPT_START ) { terom@14: LEX_CHAR ( '&', URL_OPT_SEP ), terom@15: LEX_INVALID ( '=' ), terom@14: LEX_DEFAULT ( URL_OPT_KEY ), terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_OPT_KEY ) { terom@14: LEX_CHAR ( '&', URL_OPT_SEP ), terom@14: LEX_CHAR ( '=', URL_OPT_EQ ), terom@14: LEX_DEFAULT ( URL_OPT_KEY ), terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_OPT_EQ ) { terom@14: LEX_CHAR ( '&', URL_OPT_SEP ), terom@15: LEX_INVALID ( '=' ), terom@14: LEX_DEFAULT ( URL_OPT_VAL ), terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_OPT_VAL ) { terom@14: LEX_CHAR ( '&', URL_OPT_SEP ), terom@15: LEX_INVALID ( '=' ), terom@14: LEX_DEFAULT ( URL_OPT_VAL ), terom@14: }, terom@14: terom@14: LEX_STATE_END ( URL_OPT_SEP ) { terom@14: LEX_CHAR ( '&', URL_OPT_SEP ), terom@15: LEX_INVALID ( '=' ), terom@14: LEX_DEFAULT ( URL_OPT_KEY ), terom@14: }, terom@14: terom@14: LEX_STATE ( URL_ERROR ) { terom@14: LEX_END terom@14: }, terom@15: } terom@14: }; terom@14: terom@14: int url_parse (struct url *url, const char *text) { terom@14: struct url_state state; ZINIT(state); terom@14: int ret; terom@14: terom@14: // set up state terom@14: state.url = url; terom@14: terom@14: // parse it terom@14: if ((ret = lexer(&url_lex, text, &state))) terom@14: ERROR("invalid URL"); terom@14: terom@14: // success terom@14: return 0; terom@14: terom@14: error: terom@14: return -1; terom@14: } terom@14: