terom@15: #define _GNU_SOURCE terom@15: #include terom@15: #include terom@13: terom@13: #include "url.h" terom@15: #include "lex.h" terom@15: #include "error.h" terom@16: #include "log.h" terom@15: #include "misc.h" terom@13: terom@14: enum url_token { terom@13: URL_INVALID, terom@13: terom@14: URL_BEGIN, terom@14: terom@14: // kludge to resolve ambiguous URL_SCHEME/URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE at the beginning terom@14: URL_BEGIN_ALNUM, terom@14: URL_BEGIN_COLON, terom@14: terom@13: URL_SCHEME, terom@13: URL_SCHEME_SEP, terom@13: URL_SCHEME_END_COL, terom@13: URL_SCHEME_END_SLASH1, terom@13: URL_SCHEME_END_SLASH2, terom@14: terom@14: // kludge to resolve ambiguous URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE after a scheme terom@14: URL_USERHOST_ALNUM, terom@14: URL_USERHOST_COLON, terom@14: URL_USERHOST_ALNUM2, terom@13: terom@13: URL_USERNAME, terom@13: URL_PASSWORD_SEP, terom@13: URL_PASSWORD, terom@13: URL_USERNAME_END, terom@13: terom@13: URL_HOSTNAME, terom@13: terom@13: URL_SERVICE_SEP, terom@13: URL_SERVICE, terom@13: terom@13: URL_PATH_START, terom@13: URL_PATH, terom@13: terom@13: URL_OPT_START, terom@13: URL_OPT_KEY, terom@13: URL_OPT_EQ, terom@13: URL_OPT_VAL, terom@13: URL_OPT_SEP, terom@14: terom@13: URL_MAX, terom@13: }; terom@13: terom@14: /* terom@14: * Parser state terom@14: */ terom@14: struct url_state { terom@15: // the URL to parse into terom@14: struct url *url; terom@15: terom@15: // our lookahead-kludge terom@15: const char *alnum, *alnum2; terom@15: terom@15: }; terom@13: terom@17: static int _url_append_scheme (struct url *url, const char *data, int copy) { terom@17: if (!url->schema) { terom@17: if ((url->schema = malloc(sizeof(struct url_schema) + (1 * sizeof(const char *)))) == NULL) terom@17: ERROR("malloc"); terom@17: terom@17: url->schema->count = 1; terom@17: terom@17: } else { terom@17: url->schema->count++; terom@17: terom@17: // I'm starting to hate flexible array members... terom@17: if ((url->schema = realloc(url->schema, sizeof(struct url_schema) + url->schema->count * sizeof(const char *))) == NULL) terom@17: ERROR("realloc"); terom@17: } terom@17: terom@17: if ((url->schema->list[url->schema->count - 1] = copy ? strdup(data) : data) == NULL) terom@17: ERROR("strdup"); terom@17: terom@17: // k terom@16: return 0; terom@17: terom@17: error: terom@17: return -1; terom@15: } terom@13: terom@18: static struct url_opt *_url_get_opt (struct url *url, int new) { terom@18: if (!url->opts) { terom@18: if ((url->opts = malloc(sizeof(struct url_opts) + (1 * sizeof(struct url_opt)))) == NULL) terom@18: ERROR("malloc"); terom@18: terom@18: url->opts->count = 1; terom@18: terom@18: } else if (new) { terom@18: url->opts->count++; terom@18: terom@18: if ((url->opts = realloc(url->opts, sizeof(struct url_opts) + url->opts->count * sizeof(struct url_opt))) == NULL) terom@18: ERROR("realloc"); terom@18: } terom@18: terom@18: // success terom@18: return &url->opts->list[url->opts->count - 1]; terom@18: terom@18: error: terom@18: return NULL; terom@15: } terom@15: terom@18: static int _url_append_opt_key (struct url *url, const char *key) { terom@18: struct url_opt *opt; terom@18: terom@18: if ((opt = _url_get_opt(url, 1)) == NULL) terom@18: goto error; terom@18: terom@18: if ((opt->key = strdup(key)) == NULL) terom@18: ERROR("strdup"); terom@18: terom@18: opt->value = NULL; terom@18: terom@18: return 0; terom@18: terom@18: error: terom@18: return -1; terom@18: } terom@18: terom@15: static int _url_append_opt_val (struct url *url, const char *value) { terom@18: struct url_opt *opt; terom@18: terom@18: if ((opt = _url_get_opt(url, 0)) == NULL) terom@18: goto error; terom@18: terom@18: if ((opt->value = strdup(value)) == NULL) terom@18: ERROR("strdup"); terom@18: terom@16: return 0; terom@18: terom@18: error: terom@18: return -1; terom@16: } terom@15: terom@16: static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg); terom@16: terom@16: static struct lex url_lex = { terom@16: .token_fn = url_lex_token, terom@16: .char_fn = NULL, terom@16: .end_fn = NULL, terom@16: terom@16: .state_count = URL_MAX, terom@16: .initial_state = URL_BEGIN, terom@16: .state_list = { terom@16: LEX_STATE ( URL_BEGIN ) { terom@16: LEX_ALNUM ( URL_BEGIN_ALNUM ), terom@16: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME terom@16: LEX_STATE_END ( URL_BEGIN_ALNUM ) { terom@16: LEX_CHAR ( '+', URL_SCHEME_SEP ), // it was URL_SCHEME terom@16: LEX_CHAR ( ':', URL_BEGIN_COLON ), terom@16: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME terom@16: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME terom@16: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME terom@18: LEX_DEFAULT ( URL_BEGIN_ALNUM ) terom@16: }, terom@16: terom@16: // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP terom@16: LEX_STATE ( URL_BEGIN_COLON ) { terom@16: LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), // it was URL_SCHEME terom@16: LEX_ALNUM ( URL_USERHOST_ALNUM2 ), terom@16: LEX_END terom@16: }, terom@16: terom@16: terom@16: LEX_STATE ( URL_SCHEME ) { terom@16: LEX_ALNUM ( URL_SCHEME ), terom@16: LEX_CHAR ( '+', URL_SCHEME_SEP ), terom@16: LEX_CHAR ( ':', URL_SCHEME_END_COL ), terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE ( URL_SCHEME_SEP ) { terom@16: LEX_ALNUM ( URL_SCHEME ), terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE ( URL_SCHEME_END_COL ) { terom@16: LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE ( URL_SCHEME_END_SLASH1 ) { terom@16: LEX_CHAR ( '/', URL_SCHEME_END_SLASH2 ), terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) { terom@16: LEX_ALNUM ( URL_USERHOST_ALNUM ), terom@16: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: // this can be URL_USERNAME or URL_HOSTNAME terom@16: LEX_STATE_END ( URL_USERHOST_ALNUM ) { terom@16: LEX_CHAR ( ':', URL_USERHOST_COLON ), terom@16: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME terom@16: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME terom@16: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME terom@16: LEX_DEFAULT ( URL_USERHOST_ALNUM ), terom@16: }, terom@16: terom@16: // this can be URL_USERNAME_END or URL_SERVICE_SEP terom@16: LEX_STATE ( URL_USERHOST_COLON ) { terom@16: LEX_ALNUM ( URL_USERHOST_ALNUM2 ), terom@16: LEX_END terom@16: }, terom@16: terom@16: // this can be URL_PASSWORD or URL_SERVICE terom@16: LEX_STATE_END ( URL_USERHOST_ALNUM2 ) { terom@16: LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_PASSSWORD terom@16: LEX_CHAR ( '/', URL_PATH_START ), // it was URL_SERVICE terom@16: LEX_CHAR ( '?', URL_OPT_START ), // it was URL_SERVICE terom@16: LEX_DEFAULT ( URL_USERHOST_ALNUM2 ), terom@16: }, terom@16: terom@16: // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2 terom@16: LEX_STATE ( URL_USERNAME ) { terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE ( URL_PASSWORD_SEP ) { terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE ( URL_PASSWORD ) { terom@16: LEX_END terom@16: }, terom@16: terom@16: terom@16: LEX_STATE_END ( URL_USERNAME_END ) { terom@16: LEX_ALNUM ( URL_HOSTNAME ), terom@16: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: terom@16: LEX_STATE_END ( URL_HOSTNAME ) { terom@16: LEX_ALNUM ( URL_HOSTNAME ), terom@16: LEX_CHAR ( ':', URL_SERVICE_SEP ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: terom@16: LEX_STATE ( URL_SERVICE_SEP ) { terom@16: LEX_ALNUM ( URL_SERVICE ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_SERVICE ) { terom@16: LEX_ALNUM ( URL_SERVICE ), terom@16: LEX_CHAR ( '/', URL_PATH_START ), terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_END terom@16: }, terom@16: terom@16: terom@16: LEX_STATE_END ( URL_PATH_START ) { terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_DEFAULT ( URL_PATH ), terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_PATH ) { terom@16: LEX_CHAR ( '?', URL_OPT_START ), terom@16: LEX_DEFAULT ( URL_PATH ), terom@16: }, terom@16: terom@16: terom@16: LEX_STATE_END ( URL_OPT_START ) { terom@16: LEX_CHAR ( '&', URL_OPT_SEP ), terom@16: LEX_INVALID ( '=' ), terom@16: LEX_DEFAULT ( URL_OPT_KEY ), terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_OPT_KEY ) { terom@16: LEX_CHAR ( '&', URL_OPT_SEP ), terom@16: LEX_CHAR ( '=', URL_OPT_EQ ), terom@16: LEX_DEFAULT ( URL_OPT_KEY ), terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_OPT_EQ ) { terom@16: LEX_CHAR ( '&', URL_OPT_SEP ), terom@16: LEX_INVALID ( '=' ), terom@16: LEX_DEFAULT ( URL_OPT_VAL ), terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_OPT_VAL ) { terom@16: LEX_CHAR ( '&', URL_OPT_SEP ), terom@16: LEX_INVALID ( '=' ), terom@16: LEX_DEFAULT ( URL_OPT_VAL ), terom@16: }, terom@16: terom@16: LEX_STATE_END ( URL_OPT_SEP ) { terom@16: LEX_CHAR ( '&', URL_OPT_SEP ), terom@16: LEX_INVALID ( '=' ), terom@16: LEX_DEFAULT ( URL_OPT_KEY ), terom@16: }, terom@16: terom@16: LEX_STATE ( URL_ERROR ) { terom@16: LEX_END terom@16: }, terom@16: } terom@16: }; terom@13: terom@14: static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) { terom@14: enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token; terom@14: struct url_state *state = arg; terom@15: const char **copy_to = NULL; terom@13: terom@15: (void) prev_token; terom@15: terom@15: switch (this_token) { terom@16: case URL_BEGIN: terom@16: // irrelevant terom@16: break; terom@16: terom@15: case URL_BEGIN_ALNUM: terom@15: switch (next_token) { terom@15: case URL_SCHEME_SEP: terom@15: // store the scheme terom@17: if (_url_append_scheme(state->url, token_data, 1)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_USERNAME_END: terom@15: // store the username terom@15: copy_to = &state->url->username; break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@15: // store the hostname terom@15: copy_to = &state->url->hostname; break; terom@14: terom@15: case URL_BEGIN_COLON: terom@15: // gah... terom@15: copy_to = &state->alnum; break; terom@15: terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_BEGIN_COLON: terom@15: switch (next_token) { terom@15: case URL_SCHEME_END_SLASH1: terom@15: // store the schema terom@17: if (_url_append_scheme(state->url, state->alnum, 0)) terom@15: goto error; terom@17: terom@17: state->alnum = NULL; terom@15: terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM2: terom@15: // gah.. terom@15: break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_SCHEME: terom@15: // store the scheme terom@17: if (_url_append_scheme(state->url, token_data, 1)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_SCHEME_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_SCHEME_END_COL: terom@15: case URL_SCHEME_END_SLASH1: terom@15: case URL_SCHEME_END_SLASH2: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM: terom@15: switch (next_token) { terom@15: case URL_USERNAME_END: terom@15: // store the username terom@15: copy_to = &state->url->username; break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@15: // store the hostname terom@15: copy_to = &state->url->hostname; break; terom@15: terom@15: case URL_USERHOST_COLON: terom@15: // gah... terom@15: copy_to = &state->alnum; break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_USERHOST_COLON: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_USERHOST_ALNUM2: terom@15: switch (next_token) { terom@15: case URL_USERNAME_END: terom@15: // store the username and password terom@15: state->url->username = state->alnum; state->alnum = NULL; terom@15: copy_to = &state->url->password; terom@15: terom@15: break; terom@15: terom@15: case URL_PATH_START: terom@15: case URL_OPT_START: terom@15: case LEX_EOF: terom@16: // store the hostname and service terom@16: state->url->hostname = state->alnum; state->alnum = NULL; terom@15: copy_to = &state->url->service; break; terom@15: terom@15: default: terom@15: FATAL("weird next token"); terom@15: } terom@15: terom@15: break; terom@15: terom@15: case URL_USERNAME: terom@15: case URL_PASSWORD_SEP: terom@15: case URL_PASSWORD: terom@15: FATAL("these should be overshadowed"); terom@15: terom@15: case URL_USERNAME_END: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_HOSTNAME: terom@15: // store terom@15: copy_to = &state->url->hostname; break; terom@15: terom@15: case URL_SERVICE_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_SERVICE: terom@15: // store terom@15: copy_to = &state->url->service; break; terom@15: terom@15: case URL_PATH_START: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_PATH: terom@15: // store terom@15: copy_to = &state->url->path; break; terom@15: terom@15: case URL_OPT_START: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_OPT_KEY: terom@15: // store terom@15: if (_url_append_opt_key(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_OPT_EQ: terom@15: // ignore terom@15: break; terom@15: terom@15: case URL_OPT_VAL: terom@15: // store terom@15: if (_url_append_opt_val(state->url, token_data)) terom@15: goto error; terom@15: terom@15: break; terom@15: terom@15: case URL_OPT_SEP: terom@15: // ignore terom@15: break; terom@15: terom@15: default: terom@16: ERROR("invalid token"); terom@15: } terom@15: terom@15: if (copy_to) { terom@15: // copy the token data terom@15: if ((*copy_to = strdup(token_data)) == NULL) terom@15: ERROR("strdup"); terom@15: } terom@15: terom@15: // good terom@15: return 0; terom@15: terom@15: error: terom@16: DEBUG("token: %s -> %s -> %s: %s", terom@16: LEX_STATE_NAME(&url_lex, prev_token), LEX_STATE_NAME(&url_lex, this_token), LEX_STATE_NAME(&url_lex, next_token), terom@16: token_data terom@16: ); terom@15: return -1; terom@14: } terom@14: terom@14: terom@14: int url_parse (struct url *url, const char *text) { terom@14: struct url_state state; ZINIT(state); terom@14: int ret; terom@14: terom@14: // set up state terom@14: state.url = url; terom@14: terom@14: // parse it terom@14: if ((ret = lexer(&url_lex, text, &state))) terom@14: ERROR("invalid URL"); terom@14: terom@14: // success terom@14: return 0; terom@14: terom@14: error: terom@14: return -1; terom@14: } terom@14: terom@16: static void _url_dump_part (const char *field, const char *val, FILE *stream) { terom@16: if (val) { terom@16: fprintf(stream, "%s=%s ", field, val); terom@16: } terom@16: } terom@16: terom@16: void url_dump (const struct url *url, FILE *stream) { terom@16: int i; terom@16: terom@16: if (url->schema) { terom@17: fprintf(stream, "schema=("); terom@16: terom@16: for (i = 0; i < url->schema->count; i++) { terom@16: if (i > 0) terom@17: fprintf(stream, ","); terom@16: terom@16: fprintf(stream, "%s", url->schema->list[i]); terom@16: } terom@16: terom@17: fprintf(stream, ") "); terom@16: } terom@16: terom@16: _url_dump_part("username", url->username, stream); terom@16: _url_dump_part("password", url->password, stream); terom@16: _url_dump_part("hostname", url->hostname, stream); terom@16: _url_dump_part("service", url->service, stream); terom@16: _url_dump_part("path", url->path, stream); terom@16: terom@16: if (url->opts) { terom@16: fprintf(stream, "opts: "); terom@16: terom@16: for (i = 0; i < url->opts->count; i++) { terom@16: fprintf(stream, "%s=%s ", url->opts->list[i].key, url->opts->list[i].value); terom@16: } terom@16: } terom@16: terom@16: fprintf(stream, "\n"); terom@16: } terom@16: