more intermediate work
authorTero Marttila <terom@fixme.fi>
Tue, 07 Oct 2008 20:31:35 +0300
changeset 14 115067dfba55
parent 13 385b9a10d096
child 15 a8d183e79ed9
more intermediate work
src/lib/lexer.h
src/lib/url.c
src/lib/url.h
--- a/src/lib/lexer.h	Tue Oct 07 18:38:03 2008 +0300
+++ b/src/lib/lexer.h	Tue Oct 07 20:31:35 2008 +0300
@@ -11,27 +11,46 @@
  */
 
 /*
+ * Transition flags
+ */
+enum lex_transition_flags {
+    LEX_TRANS_DEFAULT   = 0x01,    // set by LEX_DEFAULT(); presumably "taken when no [left, right] range matches" - confirm in the lexer
+    LEX_TRANS_FINAL     = 0x02,    // NOTE(review): not referenced in the visible hunks - meaning to be confirmed
+};
+
+/*
  * A transition from one state to another.
  */
 struct lex_transition {
     // applies to chars [left, right]
     char left, right;
     
+    // flags from enum lex_transition_flags; LEX_CHAR()/LEX_RANGE() set 0, LEX_DEFAULT() sets LEX_TRANS_DEFAULT
+    char flags;
+    
     // next state to enter
     int next_state;
 };
 
 /*
+ * State flags
+ */ 
+enum lex_state_flags {
+    LEX_STATE_END       = 0x01,    // stored in lex_state.flags by the LEX_STATE_END() helper macro (which shares this name but only expands when followed by '(')
+};
+
+/*
  * A state
  */
 struct lex_state {
     // the state name (for debugging)
     const char *name;
 
+    // flags from enum lex_state_flags: 0 when built with LEX_STATE(), LEX_STATE_END when built with LEX_STATE_END()
+    char flags;
+
     // list of transitions for this state, terminated by a transition with next_state=0
     struct lex_transition *trans_list;
-
-
 };
 
 /*
@@ -54,43 +73,46 @@
      *
      * Return zero to have lexing continue, nonzero to stop lexing.
      */
-    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
+    int (*token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);
 
     /*
      * Called on every char handled by the lexer. `this_token` is the state of the token that the char belongs to.
      *
      * Return zero to have lexing continue, nonzero to stop lexing.
      */
-    int (*lex_char_fn) (int this_token, char token_char, void *arg);
+    int (*char_fn) (int this_token, char token_char, void *arg);
 
     /*
      * Called when the end of input has been reached, `last_token` is the state that we terminated in.
      *
      * Return zero to indicate that the input was valid, nonzero to indicate an error.
      */
-    int (*lex_end_fn) (int last_token, void *arg);
+    int (*end_fn) (int last_token, void *arg);
 };
 
 /*
  * Helper macros for building the state_list
  */
-#define LEX_STATE(enum_val)     { #enum_val, {
+#define LEX_STATE(enum_val)     { #enum_val, 0,                // opens a state with no flags; the caller follows with "{ transitions... }"
+#define LEX_STATE_END(enum_val) { #enum_val, LEX_STATE_END,    // same, but with the LEX_STATE_END flag set
 
-    #define LEX_CHAR(c, to)         { c, c, to },
-    #define LEX_RANGE(l, r, to)     { l, r, to },
+    #define LEX_CHAR(c, to)         { c, c, 0, to }            // no trailing comma: call sites and LEX_ALPHA/LEX_ALNUM/LEX_WHITESPACE add the separators
+    #define LEX_RANGE(l, r, to)     { l, r, 0, to }            // inclusive range [l, r]; no trailing comma, for the same reason
     #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
     #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
     #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
     #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
 
-#define LEX_STATE_END               {0, 0, 0} \
-                                } }
+    #define LEX_DEFAULT(to)         { 0, 0, LEX_TRANS_DEFAULT, to } \
+                                  }
+    #define LEX_END                 { 0, 0, 0, 0 } \
+                                  }
 
 /*
  * Lex it!
  *
  * Return zero to indicate that the input was valid, nonzero otherwise.
  */
-int lexer (struct lex *lex, const char *input, void *arg);
+int lexer (const struct lex *lex, const char *input, void *arg);
 
 #endif /* LIB_LEXER_H */
--- a/src/lib/url.c	Tue Oct 07 18:38:03 2008 +0300
+++ b/src/lib/url.c	Tue Oct 07 20:31:35 2008 +0300
@@ -2,14 +2,25 @@
 #include "url.h"
 #include "lexer.h"
 
-enum url_tokens {
+enum url_token {
     URL_INVALID,
     
+    URL_BEGIN,
+
+    // kludge to resolve ambiguous URL_SCHEME/URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE at the beginning
+    URL_BEGIN_ALNUM,
+    URL_BEGIN_COLON,
+
     URL_SCHEME,
     URL_SCHEME_SEP,
     URL_SCHEME_END_COL,
     URL_SCHEME_END_SLASH1,
     URL_SCHEME_END_SLASH2,
+
+    // kludge to resolve ambiguous URL_USERNAME+URL_PASSWORD/URL_HOSTNAME+URL_SERVICE after a scheme 
+    URL_USERHOST_ALNUM,
+    URL_USERHOST_COLON,
+    URL_USERHOST_ALNUM2,
     
     URL_USERNAME,
     URL_PASSWORD_SEP,
@@ -29,20 +40,231 @@
     URL_OPT_EQ,
     URL_OPT_VAL,
     URL_OPT_SEP,
+    
+    URL_END,
 
     URL_MAX,
 };
 
-static struct lex *url_lex = {
-    .state_count = URL_MAX,
-    .stae_list = {
-        LEX_STATE(URL_SCHEME)
-            LEX_ALNUM       (           URL_SCHEME          ),
-            LEX_CHAR        (   '+',    URL_SCHEME_SEP      ),
-        LEX_STATE_END,
+/*
+ * Parser state
+ */
+struct url_state {
+    struct url *url;    // the caller's url, stored by url_parse() before it runs the lexer
 
 
+};
 
-    },
+static int url_lex_token (int _this_token, char *token_data, int _next_token, int _prev_token, void *arg) {   // token_fn callback registered in url_lex
+    enum url_token this_token = _this_token, next_token = _next_token, prev_token = _prev_token;
+    struct url_state *state = arg;   // presumably the &state that url_parse() hands to lexer() - confirm in the lexer
+    return 0;   // stub: no token handling yet; zero asks the lexer to continue (see token_fn in lexer.h)
 }
 
+static int url_lex_end (int _last_token, void *arg) {   // end_fn callback registered in url_lex
+    enum url_token last_token = _last_token;
+    struct url_state *state = arg;   // presumably the &state that url_parse() hands to lexer() - confirm in the lexer
+    return 0;   // stub: no end-of-input checks yet; zero reports the input as valid (see end_fn in lexer.h)
+}
+
+static struct lex url_lex = {   // state table and callbacks that url_parse() hands to lexer()
+    .state_count = URL_MAX,
+    .state_list = {
+        LEX_STATE ( URL_BEGIN ) {
+            LEX_ALNUM       (           URL_BEGIN_ALNUM         ),
+            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+        
+        // this can be URL_SCHEME, URL_USERNAME or URL_HOSTNAME
+        LEX_STATE_END ( URL_BEGIN_ALNUM ) {
+            LEX_ALNUM       (           URL_BEGIN_ALNUM         ),
+            LEX_CHAR        (   '+',    URL_SCHEME_SEP          ),  // it was URL_SCHEME
+            LEX_CHAR        (   ':',    URL_BEGIN_COLON         ), 
+            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_USERNAME
+            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_HOSTNAME
+            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_HOSTNAME
+            LEX_END
+        },
+        
+        // this can be URL_SCHEME_END_COL, URL_USERNAME_END or URL_SERVICE_SEP
+        LEX_STATE ( URL_BEGIN_COLON ) {
+            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH1   ),  // it was URL_SCHEME
+            LEX_ALNUM       (           URL_USERHOST_ALNUM2     ),
+            LEX_END
+        },
+       
+
+        LEX_STATE ( URL_SCHEME ) { 
+            LEX_ALNUM       (           URL_SCHEME              ),
+            LEX_CHAR        (   '+',    URL_SCHEME_SEP          ),
+            LEX_CHAR        (   ':',    URL_SCHEME_END_COL      ),
+            LEX_END
+        },
+
+        LEX_STATE ( URL_SCHEME_SEP ) {
+            LEX_ALNUM       (           URL_SCHEME              ),
+            LEX_END
+        },
+
+        LEX_STATE ( URL_SCHEME_END_COL ) {
+            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH1   ),
+            LEX_END
+        },
+
+        LEX_STATE ( URL_SCHEME_END_SLASH1 ) {
+            LEX_CHAR        (   '/',    URL_SCHEME_END_SLASH2   ),
+            LEX_END
+        },
+
+        LEX_STATE_END ( URL_SCHEME_END_SLASH2 ) {
+            LEX_ALNUM       (           URL_USERHOST_ALNUM      ),
+            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+        
+        // this can be URL_USERNAME or URL_HOSTNAME
+        LEX_STATE_END ( URL_USERHOST_ALNUM ) {
+            LEX_ALNUM       (           URL_USERHOST_ALNUM      ),
+            LEX_CHAR        (   ':',    URL_USERHOST_COLON      ), 
+            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_USERNAME
+            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_HOSTNAME
+            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_HOSTNAME
+            LEX_END
+        },
+        
+        // this can be URL_USERNAME_END or URL_SERVICE_SEP
+        LEX_STATE ( URL_USERHOST_COLON ) {
+            LEX_ALNUM       (           URL_USERHOST_ALNUM2        ),
+            LEX_END
+        },
+        
+        // this can be URL_PASSWORD or URL_SERVICE
+        LEX_STATE_END ( URL_USERHOST_ALNUM2 ) {
+            LEX_ALNUM       (           URL_USERHOST_ALNUM2     ),  // stay in this state for the rest of the password/service, like the other *_ALNUM states
+            LEX_CHAR        (   '@',    URL_USERNAME_END        ),  // it was URL_PASSWORD
+            LEX_CHAR        (   '/',    URL_PATH_START          ),  // it was URL_SERVICE
+            LEX_CHAR        (   '?',    URL_OPT_START           ),  // it was URL_SERVICE
+            LEX_END
+        },
+        
+        // dummy states, covered by URL_USERHOST_ALNUM/URL_USERHOST_COLON/URL_USERHOST_ALNUM2
+        LEX_STATE ( URL_USERNAME ) {
+            LEX_END
+        },
+
+        LEX_STATE ( URL_PASSWORD_SEP ) {
+            LEX_END
+        },
+
+        LEX_STATE ( URL_PASSWORD ) {
+            LEX_END
+        },
+
+
+        LEX_STATE_END ( URL_USERNAME_END ) {
+            LEX_ALNUM       (           URL_HOSTNAME            ), 
+            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+
+
+        LEX_STATE_END ( URL_HOSTNAME ) {
+            LEX_ALNUM       (           URL_HOSTNAME            ), 
+            LEX_CHAR        (   ':',    URL_SERVICE_SEP         ),
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+
+
+        LEX_STATE ( URL_SERVICE_SEP ) {
+            LEX_ALNUM       (           URL_SERVICE            ), 
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+
+        LEX_STATE_END ( URL_SERVICE ) {
+            LEX_ALNUM       (           URL_SERVICE            ), 
+            LEX_CHAR        (   '/',    URL_PATH_START          ),
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_END
+        },
+
+
+        LEX_STATE_END ( URL_PATH_START ) {
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_DEFAULT     (           URL_PATH                ),
+        },
+
+        LEX_STATE_END ( URL_PATH ) {
+            LEX_CHAR        (   '?',    URL_OPT_START           ),
+            LEX_DEFAULT     (           URL_PATH                ),
+        },
+
+
+        LEX_STATE_END ( URL_OPT_START ) {
+            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
+            LEX_CHAR        (   '=',    URL_ERROR               ),
+            LEX_DEFAULT     (           URL_OPT_KEY             ),
+        },
+
+        LEX_STATE_END ( URL_OPT_KEY ) {
+            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
+            LEX_CHAR        (   '=',    URL_OPT_EQ              ),
+            LEX_DEFAULT     (           URL_OPT_KEY             ),
+        },
+
+        LEX_STATE_END ( URL_OPT_EQ ) {
+            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
+            LEX_DEFAULT     (           URL_OPT_VAL             ),
+        },
+
+        LEX_STATE_END ( URL_OPT_VAL ) {
+            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
+            LEX_DEFAULT     (           URL_OPT_VAL             ),
+        },
+
+        LEX_STATE_END ( URL_OPT_SEP ) {
+            LEX_CHAR        (   '&',    URL_OPT_SEP             ),
+            LEX_CHAR        (   '=',    URL_ERROR               ),
+            LEX_DEFAULT     (           URL_OPT_KEY             ),
+        },
+        
+        LEX_STATE ( URL_ERROR ) {   // NOTE(review): URL_ERROR is not among the url_token enumerators visible in this diff (URL_END is) - confirm it is declared
+            LEX_END
+        },
+
+        URL_MAX,    // NOTE(review): bare enumerator where a struct lex_state initializer is expected - looks like a leftover or intended sentinel, confirm
+    },
+
+    .token_fn = url_lex_token,
+    .char_fn = NULL,
+    .end_fn = url_lex_end,
+};
+
+int url_parse (struct url *url, const char *text) {   // lexes `text` with url_lex; returns 0 if the lexer accepted it, -1 otherwise
+    struct url_state state; ZINIT(state);   // ZINIT is a project macro not shown here - presumably zero-fills `state`, confirm
+    int ret;
+
+    // give the lexer callbacks access to the caller's url
+    state.url = url;
+    
+    // run the lexer; per lexer.h a nonzero result means the input was not valid
+    if ((ret = lexer(&url_lex, text, &state)))
+        ERROR("invalid URL");   // ERROR is a project macro not shown here - presumably reports and jumps to the error label below, confirm
+
+    // success
+    return 0;
+
+error:
+    return -1;
+}
+
--- a/src/lib/url.h	Tue Oct 07 18:38:03 2008 +0300
+++ b/src/lib/url.h	Tue Oct 07 20:31:35 2008 +0300
@@ -4,7 +4,7 @@
 /*
  * A trivial parser for simple URLs
  *
- * [ <scheme> [ "+" <scheme> [ ... ] ] "://" ] [ <username> [ ":" <password> ] "@" ] <hostname> [ ":" <service> ] [ "/" <path> ] [ "?" [ <key> [ "=" <value> ] ] [ "&" [ <key> [ "="     <value> ] ] [ ... ] ]
+ * [ <scheme> [ "+" <scheme> [ ... ] ] "://" ] [ <username> [ ":" <password> ] "@" ] [ <hostname> ] [ ":" <service> ] [ "/" <path> ] [ "?" [ <key> [ "=" <value> ] ] [ "&" [ <key> [ "=" <value> ] ] [ ... ] ] ]
  *
  *  example.com
  *  tcp://example.com:7348/
@@ -47,6 +47,8 @@
 /*
  * Parse the given `text` as an URL, returning the result in `url`. Optional fields that are missing in the text will
  * cause those values to be returned unmodified.
+ *
+ * Returns zero if the url was valid and was parsed, nonzero if it was invalid.
  */
 int url_parse (struct url *url, const char *text);