initial playing around with a lexer/url parser
author Tero Marttila <terom@fixme.fi>
Tue, 07 Oct 2008 18:38:03 +0300
changeset 13 385b9a10d096
parent 12 7f159ee3a3ff
child 14 115067dfba55
initial playing around with a lexer/url parser
src/lib/lexer.h
src/lib/url.c
src/lib/url.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/lexer.h	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,96 @@
+#ifndef LIB_LEXER_H
+#define LIB_LEXER_H
+
+/*
+ * Simple FSM lexing
+ *
+ * The lexer is implemented as a Finite State Machine, consisting of a number of states, each of which contains a set of
+ * transitions, which move the lexer from state to state based on one char of input at a time.
+ *
+ * Whenever the state changes, the token callback is triggered with the collected token data.
+ */
+
/*
 * A single edge of the state machine: while in the owning state, any input
 * char in the inclusive range [left, right] moves the lexer to next_state.
 */
struct lex_transition {
    // inclusive bounds of the matched char range
    char left;
    char right;

    // id of the state to transition into
    int next_state;
};
+
/*
 * One state of the machine: a debug name plus its outgoing transitions.
 */
struct lex_state {
    // human-readable state name, used for debugging
    const char *name;

    // outgoing transitions; the list ends with an entry whose next_state is 0
    struct lex_transition *trans_list;
};
+
/*
 * Lex machine
 */
struct lex {
    // how many states state_list holds
    size_t state_count;

    // state table, indexed by state id
    struct lex_state *state_list;

    /*
     * Token callback, fired each time a complete token has been lexed (that
     * is, each time the state changes). `this_token` is the state of the
     * token just finished, `token_data` holds its text, `next_token` is the
     * state that terminated it, and `prev_token` was the state before it.
     *
     * `token_data` points into a buffer owned by the lexer; it may be
     * modified in place, but its contents are overwritten by the next token,
     * so copy it if you need to keep it.
     *
     * Return zero to continue lexing, nonzero to stop.
     */
    int (*lex_token_fn) (int this_token, char *token_data, int next_token, int prev_token, void *arg);

    /*
     * Char callback, fired for every char the lexer consumes. `this_token`
     * is the state of the token the char belongs to.
     *
     * Return zero to continue lexing, nonzero to stop.
     */
    int (*lex_char_fn) (int this_token, char token_char, void *arg);

    /*
     * End-of-input callback; `last_token` is the state the lexer finished in.
     *
     * Return zero to indicate the input was valid, nonzero to signal an error.
     */
    int (*lex_end_fn) (int last_token, void *arg);
};
+
+/*
+ * Helper macros for building the state_list
+ */
+#define LEX_STATE(enum_val)     { #enum_val, {
+
+    #define LEX_CHAR(c, to)         { c, c, to },
+    #define LEX_RANGE(l, r, to)     { l, r, to },
+    #define LEX_ALPHA(to)           LEX_RANGE('a', 'z', to), LEX_RANGE('A', 'Z', to)
+    #define LEX_NUMBER(to)          LEX_RANGE('0', '9', to)
+    #define LEX_ALNUM(to)           LEX_ALPHA(to), LEX_NUMBER(to), LEX_CHAR('-', to), LEX_CHAR('_', to)
+    #define LEX_WHITESPACE(to)      LEX_CHAR(' ', to), LEX_CHAR('\n', to), LEX_CHAR('\t', to)
+
+#define LEX_STATE_END               {0, 0, 0} \
+                                } }
+
+/*
+ * Lex it!
+ *
+ * Return zero to indicate that the input was valid, nonzero otherwise.
+ */
+int lexer (struct lex *lex, const char *input, void *arg);
+
+#endif /* LIB_LEXER_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/url.c	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,48 @@
+
+#include "url.h"
+#include "lexer.h"
+
+/*
+ * Lexer states for URL parsing, one per token type.
+ *
+ * URL_INVALID is deliberately first so it gets the value 0, which the lexer
+ * reserves as the transition-list terminator (see lexer.h); URL_MAX is last
+ * so it equals the number of real states, suitable for use as state_count.
+ */
+enum url_tokens {
+    // 0 is reserved as the terminator value; never a real state
+    URL_INVALID,
+    
+    // scheme list: <scheme> [ "+" <scheme> ... ] "://"
+    URL_SCHEME,
+    URL_SCHEME_SEP,
+    URL_SCHEME_END_COL,
+    URL_SCHEME_END_SLASH1,
+    URL_SCHEME_END_SLASH2,
+    
+    // userinfo: <username> [ ":" <password> ] "@"
+    URL_USERNAME,
+    URL_PASSWORD_SEP,
+    URL_PASSWORD,
+    URL_USERNAME_END,
+
+    URL_HOSTNAME,
+
+    // ":" <service>
+    URL_SERVICE_SEP,
+    URL_SERVICE,
+
+    // "/" <path>
+    URL_PATH_START,
+    URL_PATH,
+
+    // "?" <key> "=" <value> options, separated by "&"
+    URL_OPT_START,
+    URL_OPT_KEY,
+    URL_OPT_EQ,
+    URL_OPT_VAL,
+    URL_OPT_SEP,
+
+    // not a real state; counts the states above
+    URL_MAX,
+};
+
+static struct lex *url_lex = {
+    .state_count = URL_MAX,
+    .stae_list = {
+        LEX_STATE(URL_SCHEME)
+            LEX_ALNUM       (           URL_SCHEME          ),
+            LEX_CHAR        (   '+',    URL_SCHEME_SEP      ),
+        LEX_STATE_END,
+
+
+
+    },
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/url.h	Tue Oct 07 18:38:03 2008 +0300
@@ -0,0 +1,53 @@
+#ifndef LIB_URL_H
+#define LIB_URL_H
+
+/*
+ * A trivial parser for simple URLs
+ *
+ * [ <scheme> [ "+" <scheme> [ ... ] ] "://" ] [ <username> [ ":" <password> ] "@" ] <hostname> [ ":" <service> ] [ "/" <path> ] [ "?" [ <key> [ "=" <value> ] ] [ "&" [ <key> [ "=" <value> ] ] [ ... ] ] ]
+ *
+ *  example.com
+ *  tcp://example.com:7348/
+ *  psql://postgres@localhost/test_db?charset=utf8
+ *  
+ */
+
#include <stddef.h>     // size_t -- keeps this header self-contained

/*
 * The scheme list: each "+"-separated <scheme> component, in order.
 */
struct url_schema {
    // number of entries in list
    size_t count;

    // array of scheme name strings
    const char **list;
};
+
/*
 * One key/value option from the query string.
 */
struct url_opt {
    const char *key;
    const char *value;
};

/*
 * The options at the end
 */
struct url_opts {
    size_t count;
    struct url_opt *list;
};
+
+/*
+ * A parsed URL.
+ *
+ * Per the url_parse contract below, fields for components absent from the
+ * input text are left unmodified.
+ */
+struct url {
+    // the "+"-separated scheme list from before "://"
+    struct url_schema *schema;
+    // userinfo from before "@"
+    const char *username;
+    const char *password;
+    const char *hostname;
+    // the ":" suffix after hostname -- presumably a port number or service name; confirm
+    const char *service;
+    // everything after the "/" following the host part
+    const char *path;
+    // "?"/"&"-separated key=value options
+    struct url_opts *opts;
+};
+
+/*
+ * Parse the given `text` as an URL, returning the result in `url`. Optional fields that are missing in the text will
+ * cause those values to be returned unmodified.
+ */
+int url_parse (struct url *url, const char *text);
+
+#endif /* LIB_URL_H */