--- a/src/lib/lexer.h Tue Oct 07 18:38:03 2008 +0300
+++ b/src/lib/lexer.h Tue Oct 07 20:31:35 2008 +0300
@@ -11,27 +11,46 @@
*/
/*
+ * Transition flags
+ */
+enum lex_transition_flags {
+ LEX_TRANS_DEFAULT = 0x01,
+ LEX_TRANS_FINAL = 0x02,
+};
+
+/*
* A transition from one state to another.
*/
struct lex_transition {
// applies to chars [left, right]
char left, right;
+ // flags from lex_transition_flags
+ char flags;
+
// next state to enter
int next_state;
};
/*
+ * State flags
+ */
+enum lex_state_flags {
+ LEX_STATE_END = 0x01, // presumably: input may validly end in this state — confirm in lexer()
+};
+
+/*
* A state
*/
struct lex_state {
// the state name (for debugging)
const char *name;
+ // flags from lex_state_flags
+ char flags;
+
// list of transitions for this state, terminated by a transition with next_state=0
struct lex_transition *trans_list;
-
-
};
/*
@@ -54,43 +73,46 @@
*
* Return zero to have lexing continue, nonzero to stop lexing.
*/
- int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
+ int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
/*
* Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
*
* Return zero to have lexing continue, nonzero to stop lexing.
*/
- int (*lex_char_fn) (int this_token, char token_char, void *arg);
+ int (*char_fn) (int this_token, char token_char, void *arg);
/*
* Called when the end of input has been reached, `last_token` is the state that we terminated in.
*
* Return zero to indicate that the input was valid, nonzero to indicate an error.
*/
- int (*lex_end_fn) (int last_token, void *arg);
+ int (*end_fn) (int last_token, void *arg);
};
/*
* Helper macros for building the state_list
*/
-#define LEX_STATE(enum_val) { #enum_val, {
+#define LEX_STATE(enum_val) { #enum_val, 0,
+#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,
- #define LEX_CHAR(c, to) { c, c, to },
- #define LEX_RANGE(l, r, to) { l, r, to },
+ #define LEX_CHAR(c, to) { c, c, 0, to },
+ #define LEX_RANGE(l, r, to) { l, r, 0, to },
#define LEX_ALPHA(to) LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
#define LEX_NUMBER(to) LEX_RANGE('0', '9', to)
#define LEX_ALNUM(to) LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
#define LEX_WHITESPACE(to) LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
-#define LEX_STATE_END {0, 0, 0} \
- } }
+ #define LEX_DEFAULT(to) { 0, 0, LEX_TRANS_DEFAULT, to } \
+ }
+ #define LEX_END { 0, 0, 0, 0 } \
+ }
/*
* Lex it!
*
* Return zero to indicate that the input was valid, nonzero otherwise.
*/
-int lexer (struct lex *lex, const char *input, void *arg);
+int lexer (const struct lex *lex, const char *input, void *arg);
#endif /* LIB_LEXER_H */
--- a/src/lib/url.c Tue Oct 07 18:38:03 2008 +0300
+++ b/src/lib/url.c Tue Oct 07 20:31:35 2008 +0300
@@ -2,14 +2,25 @@
#include "url.h"
#include "lexer.h"
-enum url_tokens {
+enum url_token {
URL_INVALID,
+ URL_BEGIN,
+
+ // kludge to resolve ambiguous URL_SCHEME/URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE at the beginning
+ URL_BEGIN_ALNUM,
+ URL_BEGIN_COLON,
+
URL_SCHEME,
URL_SCHEME_SEP,
URL_SCHEME_END_COL,
URL_SCHEME_END_SLASH1,
URL_SCHEME_END_SLASH2,
+
+ // kludge to resolve ambiguous URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE after a scheme
+ URL_USERHOST_ALNUM,
+ URL_USERHOST_COLON,
+ URL_USERHOST_ALNUM2,
URL_USERNAME,
URL_PASSWORD_SEP,
@@ -29,20 +40,231 @@
URL_OPT_EQ,
URL_OPT_VAL,
URL_OPT_SEP,
+
+ URL_END,
URL_MAX,
};
-static struct lex *url_lex = {
- .state_count = URL_MAX,
- .stae_list = {
- LEX_STATE(URL_SCHEME)
- LEX_ALNUM ( URL_SCHEME ),
- LEX_CHAR ( '+', URL_SCHEME_SEP ),
- LEX_STATE_END,
+/*
+ * Parser state
+ */
+struct url_state {
+ struct url *url;
+};
- },
+static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) {
+ enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token;
+ struct url_state *state = arg;
+ return 0; // placeholder until token handling is written: zero tells the lexer to continue
}
+static int url_lex_end (int _last_token, void *arg) {
+ enum url_token last_token = _last_token;
+ struct url_state *state = arg;
+ return 0; // placeholder until end handling is written: zero reports the input as valid
+}
+
+static struct lex url_lex = {
+ .state_count = URL_MAX,
+ .state_list = {
+ LEX_STATE ( URL_BEGIN ) {
+ LEX_ALNUM ( URL_BEGIN_ALNUM ),
+ LEX_CHAR ( ':', URL_SERVICE_SEP ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+ // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME
+ LEX_STATE_END ( URL_BEGIN_ALNUM ) {
+ LEX_ALNUM ( URL_BEGIN_ALNUM ),
+ LEX_CHAR ( '+', URL_SCHEME_SEP ), // it was URL_SCHEME
+ LEX_CHAR ( ':', URL_BEGIN_COLON ),
+ LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME
+ LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME
+ LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME
+ LEX_END
+ },
+
+ // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP
+ LEX_STATE ( URL_BEGIN_COLON ) {
+ LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ), // it was URL_SCHEME
+ LEX_ALNUM ( URL_USERHOST_ALNUM2 ),
+ LEX_END
+ },
+
+
+ LEX_STATE ( URL_SCHEME ) {
+ LEX_ALNUM ( URL_SCHEME ),
+ LEX_CHAR ( '+', URL_SCHEME_SEP ),
+ LEX_CHAR ( ':', URL_SCHEME_END_COL ),
+ LEX_END
+ },
+
+ LEX_STATE ( URL_SCHEME_SEP ) {
+ LEX_ALNUM ( URL_SCHEME ),
+ LEX_END
+ },
+
+ LEX_STATE ( URL_SCHEME_END_COL ) {
+ LEX_CHAR ( '/', URL_SCHEME_END_SLASH1 ),
+ LEX_END
+ },
+
+ LEX_STATE ( URL_SCHEME_END_SLASH1 ) {
+ LEX_CHAR ( '/', URL_SCHEME_END_SLASH2 ),
+ LEX_END
+ },
+
+ LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) {
+ LEX_ALNUM ( URL_USERHOST_ALNUM ),
+ LEX_CHAR ( ':', URL_SERVICE_SEP ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+ // this can be URL_USERNAME or URL_HOSTNAME
+ LEX_STATE_END ( URL_USERHOST_ALNUM ) {
+ LEX_ALNUM ( URL_USERHOST_ALNUM ),
+ LEX_CHAR ( ':', URL_USERHOST_COLON ),
+ LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_USERNAME
+ LEX_CHAR ( '/', URL_PATH_START ), // it was URL_HOSTNAME
+ LEX_CHAR ( '?', URL_OPT_START ), // it was URL_HOSTNAME
+ LEX_END
+ },
+
+ // this can be URL_USERNAME_END or URL_SERVICE_SEP
+ LEX_STATE ( URL_USERHOST_COLON ) {
+ LEX_ALNUM ( URL_USERHOST_ALNUM2 ),
+ LEX_END
+ },
+
+ // this can be URL_PASSWORD or URL_SERVICE
+ LEX_STATE_END ( URL_USERHOST_ALNUM2 ) {
+ LEX_ALNUM ( URL_USERHOST_ALNUM2 ), // stay in this state for the rest of the password/service
+ LEX_CHAR ( '@', URL_USERNAME_END ), // it was URL_PASSWORD
+ LEX_CHAR ( '/', URL_PATH_START ), // it was URL_SERVICE
+ LEX_CHAR ( '?', URL_OPT_START ), // it was URL_SERVICE
+ LEX_END
+ },
+
+ // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2
+ LEX_STATE ( URL_USERNAME ) {
+ LEX_END
+ },
+
+ LEX_STATE ( URL_PASSWORD_SEP ) {
+ LEX_END
+ },
+
+ LEX_STATE ( URL_PASSWORD ) {
+ LEX_END
+ },
+
+
+ LEX_STATE_END ( URL_USERNAME_END ) {
+ LEX_ALNUM ( URL_HOSTNAME ),
+ LEX_CHAR ( ':', URL_SERVICE_SEP ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+
+ LEX_STATE_END ( URL_HOSTNAME ) {
+ LEX_ALNUM ( URL_HOSTNAME ),
+ LEX_CHAR ( ':', URL_SERVICE_SEP ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+
+ LEX_STATE ( URL_SERVICE_SEP ) {
+ LEX_ALNUM ( URL_SERVICE ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+ LEX_STATE_END ( URL_SERVICE ) {
+ LEX_ALNUM ( URL_SERVICE ),
+ LEX_CHAR ( '/', URL_PATH_START ),
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_END
+ },
+
+
+ LEX_STATE_END ( URL_PATH_START ) {
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_DEFAULT ( URL_PATH ),
+ },
+
+ LEX_STATE_END ( URL_PATH ) {
+ LEX_CHAR ( '?', URL_OPT_START ),
+ LEX_DEFAULT ( URL_PATH ),
+ },
+
+
+ LEX_STATE_END ( URL_OPT_START ) {
+ LEX_CHAR ( '&', URL_OPT_SEP ),
+ LEX_CHAR ( '=', URL_ERROR ),
+ LEX_DEFAULT ( URL_OPT_KEY ),
+ },
+
+ LEX_STATE_END ( URL_OPT_KEY ) {
+ LEX_CHAR ( '&', URL_OPT_SEP ),
+ LEX_CHAR ( '=', URL_OPT_EQ ),
+ LEX_DEFAULT ( URL_OPT_KEY ),
+ },
+
+ LEX_STATE_END ( URL_OPT_EQ ) {
+ LEX_CHAR ( '&', URL_OPT_SEP ),
+ LEX_DEFAULT ( URL_OPT_VAL ),
+ },
+
+ LEX_STATE_END ( URL_OPT_VAL ) {
+ LEX_CHAR ( '&', URL_OPT_SEP ),
+ LEX_DEFAULT ( URL_OPT_VAL ),
+ },
+
+ LEX_STATE_END ( URL_OPT_SEP ) {
+ LEX_CHAR ( '&', URL_OPT_SEP ),
+ LEX_CHAR ( '=', URL_ERROR ),
+ LEX_DEFAULT ( URL_OPT_KEY ),
+ },
+
+ LEX_STATE ( URL_ERROR ) {
+ LEX_END
+ },
+
+ URL_MAX,
+ },
+
+ .token_fn = url_lex_token,
+ .char_fn = NULL,
+ .end_fn = url_lex_end,
+};
+
+int url_parse (struct url *url, const char *text) { // runs url_lex over text; returns 0 on success, -1 on failure
+ struct url_state state; ZINIT(state); // presumably zero-fills state — confirm against ZINIT's definition
+ int ret;
+
+ // the lexer callbacks receive &state as their `arg`, so give them access to the caller's url
+ state.url = url;
+
+ // run the lexer over the input; a nonzero return means the input was not valid
+ if ((ret = lexer(&url_lex, text, &state)))
+ ERROR("invalid URL"); // presumably reports the message and jumps to error: — confirm against ERROR's definition
+
+ // lexer returned zero: the input was valid
+ return 0;
+
+error:
+ return -1;
+}
+