/*
 * src/lib/url.c
 *
 * Repository metadata for this revision:
 *   author      Tero Marttila <terom@fixme.fi>
 *   date        Thu, 09 Oct 2008 00:33:37 +0300
 *   changeset   16 74fb62022fb3
 *   parent      15 a8d183e79ed9
 *   child       17 0a024b29b16d
 *   permissions -rw-r--r--
 *   summary     starting to work
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>

#include "url.h"
#include "lex.h"
#include "error.h"
#include "log.h"
#include "misc.h"

/*
 * Lexer states for URL parsing. The same values are handed to url_lex_token() as the
 * this/next/prev token arguments.
 *
 * Enumerators are numbered consecutively starting at zero, so URL_MAX equals the number of
 * enumerators before it; it is what url_lex.state_count is set to.
 *
 * NOTE(review): the url_lex state table presumably relies on this exact ordering - do not
 * reorder or insert values without checking lex.h.
 */
enum url_token {
    /* zero value; never used as a state in the table */
    URL_INVALID = 0,

    /* initial state, before any input has been consumed */
    URL_BEGIN,

    /*
     * Kludge: a leading word (and the ':' after it) is ambiguous between
     * URL_SCHEME, URL_USERNAME+URL_PASSWORD and URL_HOSTNAME+URL_SERVICE.
     */
    URL_BEGIN_ALNUM,
    URL_BEGIN_COLON,

    /* scheme parts, the '+' between them, and the "://" terminator */
    URL_SCHEME,
    URL_SCHEME_SEP,
    URL_SCHEME_END_COL,
    URL_SCHEME_END_SLASH1,
    URL_SCHEME_END_SLASH2,

    /*
     * Kludge: after a scheme, "word[:word]" is still ambiguous between
     * URL_USERNAME+URL_PASSWORD and URL_HOSTNAME+URL_SERVICE.
     */
    URL_USERHOST_ALNUM,
    URL_USERHOST_COLON,
    URL_USERHOST_ALNUM2,

    /* credentials; URL_USERNAME_END is the '@' */
    URL_USERNAME,
    URL_PASSWORD_SEP,
    URL_PASSWORD,
    URL_USERNAME_END,

    /* host */
    URL_HOSTNAME,

    /* ':' followed by the service */
    URL_SERVICE_SEP,
    URL_SERVICE,

    /* '/' followed by the path */
    URL_PATH_START,
    URL_PATH,

    /* '?' followed by key[=value] options separated by '&' */
    URL_OPT_START,
    URL_OPT_KEY,
    URL_OPT_EQ,
    URL_OPT_VAL,
    URL_OPT_SEP,

    /* count sentinel, not a real state */
    URL_MAX,
};

/*
 * Parser state: the context object that url_parse() hands to the lexer and that
 * url_lex_token() receives back as its opaque argument.
 */
struct url_state {
    /* destination URL structure that parsed components are stored into */
    struct url *url;

    /*
     * Lookahead kludge: holds a heap copy of an ambiguous leading word until a later
     * token reveals what it was.
     */
    const char *alnum;

    /* second lookahead slot; not referenced by the code visible in this file */
    const char *alnum2;
};

/*
 * Placeholder for recording a scheme component on the URL.
 *
 * Currently a stub: both arguments are ignored and success is always reported.
 * Callers treat a nonzero return as failure.
 *
 * TODO: not implemented yet.
 */
static int _url_append_scheme (struct url *url, const char *data) {
    (void) url;
    (void) data;

    return 0;
}

/*
 * Placeholder for recording the key of a new "?key=value" option on the URL.
 *
 * Currently a stub: both arguments are ignored and success is always reported.
 * Callers treat a nonzero return as failure.
 *
 * TODO: not implemented yet.
 */
static int _url_append_opt_key (struct url *url, const char *key) {
    (void) url;
    (void) key;

    return 0;
}

/*
 * Placeholder for recording the value of a "?key=value" option on the URL.
 *
 * Currently a stub: both arguments are ignored and success is always reported.
 * Callers treat a nonzero return as failure.
 *
 * TODO: not implemented yet.
 */
static int _url_append_opt_val (struct url *url, const char *value) {
    (void) url;
    (void) value;

    return 0;
}

static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg);

/*
 * Lexer definition for URLs: url_lex_token() is the token callback, no per-char or
 * end-of-input callbacks are installed, and lexing starts in URL_BEGIN.
 *
 * Each entry below names a state and lists the transitions out of it. The LEX_* macros
 * come from lex.h and are not visible here; presumably LEX_STATE_END marks a state in
 * which the input is allowed to end, LEX_CHAR/LEX_ALNUM match one character or an
 * alphanumeric, LEX_INVALID rejects a character, LEX_DEFAULT matches anything else and
 * LEX_END terminates a transition list - confirm against lex.h.
 *
 * NOTE(review): the entries appear in the same order as enum url_token (minus
 * URL_INVALID); keep them in sync when adding states.
 */
static struct lex url_lex = {
    .token_fn = url_lex_token,
    .char_fn = NULL,
    .end_fn = NULL,

    .state_count = URL_MAX,
    .initial_state = URL_BEGIN,
    .state_list = {
        // nothing consumed yet: a word, or straight into service/path/options
        LEX_STATE ( URL_BEGIN ) {
            LEX_ALNUM       (           URL_BEGIN_ALNUM         ),
            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },
        
        // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME; the next character decides
        LEX_STATE_END ( URL_BEGIN_ALNUM ) {
            LEX_ALNUM       (           URL_BEGIN_ALNUM         ),
            LEX_CHAR        (   '+',    URL_SCHEME_SEP          ),  // it was URL_SCHEME
            LEX_CHAR        (   ':',    URL_BEGIN_COLON         ), 
            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_USERNAME
            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_HOSTNAME
            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_HOSTNAME
            LEX_END
        },
        
        // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP
        LEX_STATE ( URL_BEGIN_COLON ) {
            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH1   ),  // it was URL_SCHEME
            LEX_ALNUM       (           URL_USERHOST_ALNUM2     ),
            LEX_END
        },
       

        // one scheme component, after a '+' has already identified it as a scheme
        LEX_STATE ( URL_SCHEME ) { 
            LEX_ALNUM       (           URL_SCHEME              ),
            LEX_CHAR        (   '+',    URL_SCHEME_SEP          ),
            LEX_CHAR        (   ':',    URL_SCHEME_END_COL      ),
            LEX_END
        },

        // the '+' between scheme components; must be followed by another component
        LEX_STATE ( URL_SCHEME_SEP ) {
            LEX_ALNUM       (           URL_SCHEME              ),
            LEX_END
        },

        // the "://" that ends the scheme, one state per character
        LEX_STATE ( URL_SCHEME_END_COL ) {
            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH1   ),
            LEX_END
        },

        LEX_STATE ( URL_SCHEME_END_SLASH1 ) {
            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH2   ),
            LEX_END
        },

        LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) {
            LEX_ALNUM       (           URL_USERHOST_ALNUM      ),
            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },
        
        // this can be URL_USERNAME or URL_HOSTNAME
        LEX_STATE_END ( URL_USERHOST_ALNUM ) {
            LEX_CHAR        (   ':',    URL_USERHOST_COLON      ), 
            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_USERNAME
            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_HOSTNAME
            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_HOSTNAME
            LEX_DEFAULT     (           URL_USERHOST_ALNUM      ),
        },
        
        // this can be URL_USERNAME_END or URL_SERVICE_SEP
        LEX_STATE ( URL_USERHOST_COLON ) {
            LEX_ALNUM       (           URL_USERHOST_ALNUM2        ),
            LEX_END
        },
        
        // this can be URL_PASSWORD or URL_SERVICE
        LEX_STATE_END ( URL_USERHOST_ALNUM2 ) {
            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_PASSWORD
            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_SERVICE
            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_SERVICE
            LEX_DEFAULT     (           URL_USERHOST_ALNUM2     ),
        },
        
        // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2
        LEX_STATE ( URL_USERNAME ) {
            LEX_END
        },

        LEX_STATE ( URL_PASSWORD_SEP ) {
            LEX_END
        },

        LEX_STATE ( URL_PASSWORD ) {
            LEX_END
        },


        // the '@' that ends the credentials; hostname, service, path or options may follow
        LEX_STATE_END ( URL_USERNAME_END ) {
            LEX_ALNUM       (           URL_HOSTNAME            ), 
            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },


        // unambiguous hostname (seen after an '@')
        LEX_STATE_END ( URL_HOSTNAME ) {
            LEX_ALNUM       (           URL_HOSTNAME            ), 
            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },


        // the ':' before the service
        LEX_STATE ( URL_SERVICE_SEP ) {
            LEX_ALNUM       (           URL_SERVICE            ), 
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },

        LEX_STATE_END ( URL_SERVICE ) {
            LEX_ALNUM       (           URL_SERVICE            ), 
            LEX_CHAR        (   '/',    URL_PATH_START          ),
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_END
        },


        // the '/' before the path; everything up to a '?' is path
        LEX_STATE_END ( URL_PATH_START ) {
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_DEFAULT     (           URL_PATH                ),
        },

        LEX_STATE_END ( URL_PATH ) {
            LEX_CHAR        (   '?',    URL_OPT_START           ),
            LEX_DEFAULT     (           URL_PATH                ),
        },


        // the '?' before the options; a key may not start with '='
        LEX_STATE_END ( URL_OPT_START ) {
            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
            LEX_INVALID     (   '='                             ),
            LEX_DEFAULT     (           URL_OPT_KEY             ),
        },

        // option key; '=' introduces a value, '&' ends the option without one
        LEX_STATE_END ( URL_OPT_KEY ) {
            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
            LEX_CHAR        (   '=',    URL_OPT_EQ              ),
            LEX_DEFAULT     (           URL_OPT_KEY             ),
        },

        // the '=' between key and value; a second '=' is rejected
        LEX_STATE_END ( URL_OPT_EQ ) {
            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
            LEX_INVALID     (   '='                             ),
            LEX_DEFAULT     (           URL_OPT_VAL             ),
        },

        // option value; runs until '&' and may not contain '='
        LEX_STATE_END ( URL_OPT_VAL ) {
            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
            LEX_INVALID     (   '='                             ),
            LEX_DEFAULT     (           URL_OPT_VAL             ),
        },

        // the '&' between options; repeated '&' is accepted
        LEX_STATE_END ( URL_OPT_SEP ) {
            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
            LEX_INVALID     (   '='                             ),
            LEX_DEFAULT     (           URL_OPT_KEY             ),
        },
        
        // NOTE(review): URL_ERROR is not an enumerator of enum url_token as declared in this file
        // (the enum has URL_INVALID and URL_MAX) - confirm how LEX_STATE uses its argument and
        // which state value this entry corresponds to.
        LEX_STATE ( URL_ERROR ) {
            LEX_END
        },
    }
};

static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) {
    enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token;
    struct url_state *state = arg;
    const char **copy_to = NULL;

    (void) prev_token;
    
    switch (this_token) {
        case URL_BEGIN:
            // irrelevant
            break;

        case URL_BEGIN_ALNUM:
            switch (next_token) {
                case URL_SCHEME_SEP:
                    // store the scheme
                    if (_url_append_scheme(state->url, token_data))
                        goto error;
                    
                    break;
                
                case URL_USERNAME_END:
                    // store the username
                    copy_to = &state->url->username; break;
                
                case URL_PATH_START:
                case URL_OPT_START:
                case LEX_EOF:
                    // store the hostname
                    copy_to = &state->url->hostname; break;

                case URL_BEGIN_COLON:
                    // gah...
                    copy_to = &state->alnum; break;
                

                default:
                    FATAL("weird next token");
            }
            
            break;

        case URL_BEGIN_COLON:
            switch (next_token) {
                case URL_SCHEME_END_SLASH1:
                    // store the schema
                    if (_url_append_scheme(state->url, token_data))
                        goto error;

                    break;
                
                case URL_USERHOST_ALNUM2:
                    // gah..
                    break;

                default:
                    FATAL("weird next token");
            }

            break;

        case URL_SCHEME:
            // store the scheme
            if (_url_append_scheme(state->url, token_data))
                goto error;

            break;
    
        case URL_SCHEME_SEP:
            // ignore
            break;

        case URL_SCHEME_END_COL:
        case URL_SCHEME_END_SLASH1:
        case URL_SCHEME_END_SLASH2:
            // ignore
            break;
        
        case URL_USERHOST_ALNUM:
            switch (next_token) {
                case URL_USERNAME_END:
                    // store the username
                    copy_to = &state->url->username; break;
                
                case URL_PATH_START:
                case URL_OPT_START:
                case LEX_EOF:
                    // store the hostname
                    copy_to = &state->url->hostname; break;

                case URL_USERHOST_COLON:
                    // gah...
                    copy_to = &state->alnum; break;

                default:
                    FATAL("weird next token");
            }
            
            break;

        case URL_USERHOST_COLON:
            // ignore
            break;

        case URL_USERHOST_ALNUM2:
            switch (next_token) {
                case URL_USERNAME_END:
                    // store the username and password
                    state->url->username = state->alnum; state->alnum = NULL;
                    copy_to = &state->url->password;

                    break;

                case URL_PATH_START:
                case URL_OPT_START:
                case LEX_EOF:
                    // store the hostname and service
                    state->url->hostname = state->alnum; state->alnum = NULL;
                    copy_to = &state->url->service; break;

                default:
                    FATAL("weird next token");
            }

            break;

        case URL_USERNAME:
        case URL_PASSWORD_SEP:
        case URL_PASSWORD:
            FATAL("these should be overshadowed");
        
        case URL_USERNAME_END:
            // ignore
            break;

        case URL_HOSTNAME:
            // store
            copy_to = &state->url->hostname; break;

        case URL_SERVICE_SEP:
            // ignore
            break;

        case URL_SERVICE:
            // store
            copy_to = &state->url->service; break;
        
        case URL_PATH_START:
            // ignore
            break;

        case URL_PATH:
            // store
            copy_to = &state->url->path; break;

        case URL_OPT_START:
            // ignore
            break;

        case URL_OPT_KEY:
            // store
            if (_url_append_opt_key(state->url, token_data))
                goto error;

            break;

        case URL_OPT_EQ:
            // ignore
            break;

        case URL_OPT_VAL:
            // store
            if (_url_append_opt_val(state->url, token_data))
                goto error;

            break;
        
        case URL_OPT_SEP:
            // ignore
            break;
        
        default:
            ERROR("invalid token");
    }
    
    if (copy_to) {
        // copy the token data
        if ((*copy_to = strdup(token_data)) == NULL)
            ERROR("strdup");
    }

    // good
    return 0;

error:
    DEBUG("token: %s -> %s -> %s: %s", 
        LEX_STATE_NAME(&url_lex, prev_token), LEX_STATE_NAME(&url_lex, this_token), LEX_STATE_NAME(&url_lex, next_token),
        token_data
    );
    return -1;
}


/*
 * Parse the URL string `text` into `url`.
 *
 * @param url   destination structure; components are stored into it as they are lexed
 * @param text  the URL string to parse
 * @return 0 on success, -1 if the lexer rejects the input
 *
 * On failure, any components already stored into `url` are left in place.
 *
 * NOTE(review): assumes ZINIT() zero-fills the state and that ERROR() logs and jumps to the
 * `error` label - confirm in misc.h/error.h.
 */
int url_parse (struct url *url, const char *text) {
    struct url_state state; ZINIT(state);
    int ret = -1;

    // set up state
    state.url = url;
    
    // parse it
    if (lexer(&url_lex, text, &state))
        ERROR("invalid URL");

    // success
    ret = 0;

error:
    // A word parked for lookahead that was never handed over to the url (e.g. the input was
    // rejected right after "word:") is still owned by the state; release it on every exit.
    free((char *) state.alnum);

    return ret;
}

/*
 * Write one "field=value " pair (note the trailing space) to `stream`.
 *
 * Nothing is written when `val` is NULL, so unset URL components are skipped.
 */
static void _url_dump_part (const char *field, const char *val, FILE *stream) {
    // unset component: nothing to print
    if (val == NULL)
        return;

    fprintf(stream, "%s=%s ", field, val);
}

/*
 * Write a one-line, human-readable dump of `url` to `stream`, terminated by a newline.
 *
 * Output order: "schema=a+b " (if a schema list is set), then each set component as
 * "name=value ", then "opts: " followed by the options (if an option list is set).
 * Components and lists that are NULL are omitted.
 *
 * @param url     the URL to dump; must not be NULL
 * @param stream  the stdio stream to write to
 */
void url_dump (const struct url *url, FILE *stream) {
    int i;

    if (url->schema) {
        fprintf(stream, "schema=");

        // scheme components are joined with '+', mirroring the input syntax
        for (i = 0; i < url->schema->count; i++) {
            if (i > 0)
                fprintf(stream, "+");

            fprintf(stream, "%s", url->schema->list[i]);
        }

        fprintf(stream, " ");
    }

    _url_dump_part("username", url->username, stream);
    _url_dump_part("password", url->password, stream);
    _url_dump_part("hostname", url->hostname, stream);
    _url_dump_part("service", url->service, stream);
    _url_dump_part("path", url->path, stream);

    if (url->opts) {
        fprintf(stream, "opts: ");

        for (i = 0; i < url->opts->count; i++) {
            // The URL grammar accepts a key with no "=value" part, in which case the value
            // is presumably left NULL (TODO confirm once the opt helpers are implemented).
            // Passing NULL for %s is undefined behaviour, so print the bare key instead.
            if (url->opts->list[i].value)
                fprintf(stream, "%s=%s ", url->opts->list[i].key, url->opts->list[i].value);
            else
                fprintf(stream, "%s ", url->opts->list[i].key);
        }
    }

    fprintf(stream, "\n");
}