/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: April 2004

        author:         Kris

*******************************************************************************/

module tango.net.Uri;

public  import  tango.net.model.UriView;

//public alias Uri UriView;

private import  tango.core.Exception;

private import  Integer = tango.text.convert.Integer;

private import  tango.stdc.string : memchr;

/*******************************************************************************

        Implements an RFC 2396 compliant URI specification. See
        <A HREF="http://ftp.ics.uci.edu/pub/ietf/uri/rfc2396.txt">this page</A>
        for more information.

        The implementation fails the spec on two counts: it doesn't insist
        on a scheme being present in the Uri, and it doesn't implement the
        "Relative References" support noted in section 5.2. The latter can
        be found in tango.util.PathUtil instead.

        Note that IRI support can be implied by assuming each of userinfo,
        path, query, and fragment are UTF-8 encoded
        (see <A HREF="http://www.w3.org/2001/Talks/0912-IUC-IRI/paper.html">
        this page</A> for further details).

        Strings returned by the accessors after parse() may be slices of an
        internal decoding buffer which is reused by the next non-relative
        parse() or reset(); duplicate them if they must outlive the Uri
        state.

*******************************************************************************/

class Uri : UriView
{
        // simplistic string appender
        private alias scope size_t delegate(const(void)[]) Consumer;

        /// old method names
        public alias port        getPort;
        public alias defaultPort getDefaultPort;
        public alias scheme      getScheme;
        public alias host        getHost;
        public alias validPort   getValidPort;
        public alias userinfo    getUserInfo;
        public alias path        getPath;
        public alias query       getQuery;
        public alias fragment    getFragment;
        public alias port        setPort;
        public alias scheme      setScheme;
        public alias host        setHost;
        public alias userinfo    setUserInfo;
        public alias query       setQuery;
        public alias path        setPath;
        public alias fragment    setFragment;

        public enum {InvalidPort = -1}

        private int             port_;
        private const(char)[]   host_,
                                path_,
                                query_,
                                scheme_,
                                userinfo_,
                                fragment_;

        // scratch space holding the decoded components of the parsed uri
        private HeapSlice       decoded;

        // per-character classification flags, indexed by character value
        private static __gshared ubyte[] map;

        // default port for each known generic scheme
        private static __gshared short[immutable(char)[]] genericSchemes;

        private enum immutable(char)[] hexDigits = "0123456789abcdef";

        private enum SchemePort[] schemePorts =
                [
                {"coffee",      80},
                {"file",        InvalidPort},
                {"ftp",         21},
                {"gopher",      70},
                {"hnews",       80},
                {"http",        80},
                {"http-ng",     80},
                {"https",       443},
                {"imap",        143},
                {"irc",         194},
                {"ldap",        389},
                {"news",        119},
                {"nfs",         2049},
                {"nntp",        119},
                {"pop",         110},
                {"rwhois",      4321},
                {"shttp",       80},
                {"smtp",        25},
                {"snews",       563},
                {"telnet",      23},
                {"wais",        210},
                {"whois",       43},
                {"whois++",     43},
                ];

        /***********************************************************************

                Character-class flags stored in the map. The Exc* flags mark
                characters which terminate a component while parsing; the
                Inc* flags mark characters which may be emitted unescaped
                by encode() for the given component.

                Each flag occupies its own bit. IncUser previously shared
                0x80 with IncScheme, which made encode(.., IncScheme) accept
                userinfo-only characters such as ';' or '$'.

        ***********************************************************************/

        public enum
        {
                ExcScheme       = 0x01,
                ExcAuthority    = 0x02,
                ExcPath         = 0x04,
                IncUser         = 0x08,         // encode spec for User
                IncPath         = 0x10,         // encode spec for Path
                IncQuery        = 0x20,         // encode spec for Query
                IncQueryAll     = 0x40,
                IncScheme       = 0x80,         // encode spec for Scheme
                IncGeneric      = IncScheme |
                                  IncUser   |
                                  IncPath   |
                                  IncQuery  |
                                  IncQueryAll
        }

        // scheme and port pairs
        private struct SchemePort
        {
                const(char)[]   name;
                short           port;
        }

        /***********************************************************************

                Initialize the Uri character maps and so on

        ***********************************************************************/

        shared static this ()
        {
                // Map known generic schemes to their default port. Specify
                // InvalidPort for those schemes that don't use ports. Note
                // that a port value of zero is not supported ...
                foreach (SchemePort sp; schemePorts)
                         genericSchemes[sp.name] = sp.port;
                genericSchemes.rehash;

                map = new ubyte[256];

                // load the character map with valid symbols
                for (int i='a'; i <= 'z'; ++i)
                     map[i] = IncGeneric;

                for (int i='A'; i <= 'Z'; ++i)
                     map[i] = IncGeneric;

                for (int i='0'; i <= '9'; ++i)
                     map[i] = IncGeneric;

                // exclude these from parsing elements
                map[':'] |= ExcScheme;
                map['/'] |= ExcScheme | ExcAuthority;
                map['?'] |= ExcScheme | ExcAuthority | ExcPath;
                map['#'] |= ExcScheme | ExcAuthority | ExcPath;

                // include these as common (unreserved) symbols
                foreach (char c; "-_.!~*'()")
                         map[c] |= IncUser | IncQuery | IncQueryAll | IncPath;

                // include these as scheme symbols
                foreach (char c; "+-.")
                         map[c] |= IncScheme;

                // include these as userinfo symbols
                foreach (char c; ";:&=+$,")
                         map[c] |= IncUser;

                // include these as path symbols
                foreach (char c; "/;:@&=+$,")
                         map[c] |= IncPath;

                // include these as query symbols
                foreach (char c; ";/:@=$,")
                         map[c] |= IncQuery | IncQueryAll;

                // '%' are permitted inside queries when constructing output
                foreach (char c; "%?&")
                         map[c] |= IncQueryAll;
        }

        /***********************************************************************

                Create an empty Uri

        ***********************************************************************/

        this ()
        {
                port_ = InvalidPort;
                decoded.expand (512);
        }

        /***********************************************************************

                Construct a Uri from the provided character string

        ***********************************************************************/

        this (const(char)[] uri)
        {
                this ();
                parse (uri);
        }

        /***********************************************************************

                Construct a Uri from the given components. The query is
                optional.

        ***********************************************************************/

        this (const(char)[] scheme, const(char)[] host, const(char)[] path, const(char)[] query = null)
        {
                this ();

                this.scheme_ = scheme;
                this.query_ = query;
                this.host_ = host;
                this.path_ = path;
        }

        /***********************************************************************

                Clone another Uri. This can be used to make a mutable Uri
                from an immutable UriView.

        ***********************************************************************/

        this (UriView other)
        {
                with (other)
                     {
                     this (getScheme(), getHost(), getPath(), getQuery());
                     this.userinfo_ = getUserInfo();
                     this.fragment_ = getFragment();
                     this.port_ = getPort();
                     }
        }

        /***********************************************************************

                Return the default port for the given scheme. InvalidPort
                is returned if the scheme is unknown, or does not accept
                a port.

        ***********************************************************************/

        override final const int defaultPort (const(char)[] scheme)
        {
                short* port = scheme in genericSchemes;
                if (port is null)
                    return InvalidPort;
                return *port;
        }

        /***********************************************************************

                Return the parsed scheme, or null if the scheme was not
                specified

        ***********************************************************************/

        @property override final const const(char)[] scheme()
        {
                return scheme_;
        }

        /***********************************************************************

                Return the parsed host, or null if the host was not
                specified

        ***********************************************************************/

        @property override final const const(char)[] host()
        {
                return host_;
        }

        /***********************************************************************

                Return the parsed port number, or InvalidPort if the port
                was not provided.

        ***********************************************************************/

        @property override final const int port()
        {
                return port_;
        }

        /***********************************************************************

                Return a valid port number by performing a lookup on the
                known schemes if the port was not explicitly specified.

        ***********************************************************************/

        override final const int validPort()
        {
                if (port_ is InvalidPort)
                    return defaultPort (scheme_);
                return port_;
        }

        /***********************************************************************

                Return the parsed userinfo, or null if userinfo was not
                provided.

        ***********************************************************************/

        @property override final const const(char)[] userinfo()
        {
                return userinfo_;
        }

        /***********************************************************************

                Return the parsed path, or null if the path was not
                provided.

        ***********************************************************************/

        @property override final const const(char)[] path()
        {
                return path_;
        }

        /***********************************************************************

                Return the parsed query, or null if a query was not
                provided.

        ***********************************************************************/

        @property override final const const(char)[] query()
        {
                return query_;
        }

        /***********************************************************************

                Return the parsed fragment, or null if a fragment was not
                provided.

        ***********************************************************************/

        @property override final const const(char)[] fragment()
        {
                return fragment_;
        }

        /***********************************************************************

                Return whether or not the Uri scheme is considered generic.

        ***********************************************************************/

        @property override final const bool isGeneric ()
        {
                return (scheme_ in genericSchemes) !is null;
        }

        /***********************************************************************

                Emit the content of this Uri via the provided Consumer. The
                output is constructed per RFC 2396.

                Userinfo, path, query and fragment are percent-encoded on
                the way out; scheme and host are emitted verbatim. A port
                equal to the default port of the scheme is omitted.

                Returns the sum of the values returned by the consumer.

        ***********************************************************************/

        final const size_t produce (Consumer consume)
        {
                size_t ret;

                if (scheme_.length)
                   {
                   ret += consume (scheme_);
                   ret += consume (":");
                   }

                if (userinfo_.length || host_.length || port_ != InvalidPort)
                   {
                   ret += consume ("//");

                   if (userinfo_.length)
                      {
                      ret += encode (consume, userinfo_, IncUser);
                      ret += consume ("@");
                      }

                   if (host_.length)
                       ret += consume (host_);

                   if (port_ != InvalidPort && port_ != getDefaultPort(scheme_))
                      {
                      char[8] tmp;
                      ret += consume (":");
                      ret += consume (Integer.itoa (tmp, cast(uint) port_));
                      }
                   }

                if (path_.length)
                    ret += encode (consume, path_, IncPath);

                if (query_.length)
                   {
                   ret += consume ("?");
                   ret += encode (consume, query_, IncQueryAll);
                   }

                if (fragment_.length)
                   {
                   ret += consume ("#");
                   ret += encode (consume, fragment_, IncQuery);
                   }

                return ret;
        }

        /***********************************************************************

                Return the content of this Uri as a string. The output is
                constructed per RFC 2396 (see produce).

        ***********************************************************************/

        override final string toString ()
        {
                immutable(void)[] s;

                s.length = 256;
                s.length = 0;

                // the appended length is returned explicitly rather than
                // through the value of a comma expression
                produce ((const(void)[] v) {s ~= v; return v.length;});
                return cast(immutable(char)[]) s;
        }

        /***********************************************************************

                Encode uri characters into a Consumer, such that
                reserved chars are converted into their %hex version.

                Params:
                consume = receives the output, piece by piece
                s       = the text to encode
                flags   = one or more Inc* flags; a character is emitted
                          unescaped when its map entry has any of them set

                Returns the sum of the values returned by the consumer.

        ***********************************************************************/

        static size_t encode (Consumer consume, const(char)[] s, int flags)
        {
                size_t  ret;
                size_t  mark;
                char[3] hex;

                hex[0] = '%';
                foreach (i, char c; s)
                        {
                        if (! (map[c] & flags))
                           {
                           // flush the clean run preceding this character
                           ret += consume (s[mark..i]);
                           mark = i+1;

                           hex[1] = hexDigits [(c >> 4) & 0x0f];
                           hex[2] = hexDigits [c & 0x0f];
                           ret += consume (hex);
                           }
                        }

                // add trailing section
                if (mark < s.length)
                    ret += consume (s[mark..s.length]);

                return ret;
        }

        /***********************************************************************

                Encode uri characters into a string, such that reserved
                chars are converted into their %hex version.

                Returns a freshly allocated string

        ***********************************************************************/

        static char[] encode (const(char)[] text, int flags)
        {
                void[] s;
                encode ((const(void)[] v) {s ~= v; return v.length;}, text, flags);
                return cast(char[]) s;
        }

        /***********************************************************************

                Decode a character string with potential %hex values in it.
                The decoded strings are placed into the per-instance
                expanding buffer, and a slice of it is returned to the
                caller. Input without any '%' is returned as is.

                Params:
                s      = the text to decode
                ignore = when non-zero, escapes which decode to this
                         character are left in their escaped form

                Malformed escapes (a '%' which is not followed by two hex
                digits) are copied through unchanged.

        ***********************************************************************/

        private const(char)[] decoder (const(char)[] s, char ignore=0)
        {
                // value of a hex digit, or -1 when c is not a hex digit
                static int toInt (char c)
                {
                        if (c >= '0' && c <= '9')
                            return c - '0';
                        if (c >= 'a' && c <= 'f')
                            return c - ('a' - 10);
                        if (c >= 'A' && c <= 'F')
                            return c - ('A' - 10);
                        return -1;
                }

                auto length = s.length;

                // take a peek first, to see if there's work to do
                if (length && memchr (s.ptr, '%', length))
                   {
                   char* p;
                   int   j;

                   // ensure we have enough decoding space available; the
                   // output is never longer than the input
                   p = cast(char*) decoded.expand (length);

                   // scan string, stripping % encodings as we go
                   for (size_t i = 0; i < length; ++i, ++j, ++p)
                       {
                       int c = s[i];

                       if (c is '%' && (i+2) < length)
                          {
                          auto hi = toInt (s[i+1]);
                          auto lo = toInt (s[i+2]);

                          // only a well-formed escape is decoded
                          if (hi >= 0 && lo >= 0)
                             {
                             auto value = hi * 16 + lo;

                             // leave ignored escapes in the stream,
                             // permitting escaped '&' to remain in
                             // the query string
                             if (! (value && value == ignore))
                                {
                                c = value;
                                i += 2;
                                }
                             }
                          }

                       *p = cast(char) c;
                       }

                   // return a slice from the decoded input
                   return cast(char[]) decoded.slice (j);
                   }

                // return original content
                return s;
        }

        /***********************************************************************

                Decode a duplicated string with potential %hex values in it

        ***********************************************************************/

        final char[] decode (const(char)[] s)
        {
                return decoder(s).dup;
        }

        /***********************************************************************

                Parsing is performed according to RFC 2396

                <pre>
                  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
                   12            3  4          5       6  7        8 9

                2 isolates scheme
                4 isolates authority
                5 isolates path
                7 isolates query
                9 isolates fragment
                </pre>

                This was originally a state-machine; it turned out to be a
                lot faster (~40%) when unwound like this instead.

                Params:
                uri      = the text to parse; an empty uri is accepted
                relative = when true, the existing state is retained and
                           the uri is applied on top of it

                Returns this Uri, to support call chaining.

        ***********************************************************************/

        final Uri parse (const(char)[] uri, bool relative = false)
        {
                char    c;
                size_t  i,
                        mark;
                auto    prefix = path_;
                auto    len = uri.length;

                if (! relative)
                      reset();

                // isolate scheme (note that it's OK to not specify a scheme)
                for (i=0; i < len && !(map[c = uri[i]] & ExcScheme); ++i) {}
                if (c is ':')
                   {
                   /* Bad dup? */
                   auto lower_uri = uri [mark .. i].dup;
                   toLower (lower_uri);
                   scheme_ = lower_uri;
                   mark = i + 1;
                   }

                // isolate authority. Written as mark+1 < len since len is
                // unsigned: len-1 would wrap around for an empty uri
                if (mark + 1 < len && uri[mark] is '/' && uri[mark+1] is '/')
                   {
                   for (mark+=2, i=mark; i < len && !(map[uri[i]] & ExcAuthority); ++i) {}
                   parseAuthority (uri[mark .. i]);
                   mark = i;
                   }
                else
                   if (relative)
                      {
                      // NOTE(review): the head is prepended and then skipped
                      // again via mark, as in the original; confirm intent
                      auto head = (uri.length && uri[0] is '/') ? host_ : toLastSlash(prefix);
                      query_ = fragment_ = null;
                      uri = head ~ uri;
                      len = uri.length;
                      mark = head.length;
                      }

                // isolate path
                for (i=mark; i < len && !(map[uri[i]] & ExcPath); ++i) {}
                path_ = decoder (uri[mark .. i]);
                mark = i;

                // isolate query
                if (mark < len && uri[mark] is '?')
                   {
                   for (++mark, i=mark; i < len && uri[i] != '#'; ++i) {}
                   query_ = decoder (uri[mark .. i], '&');
                   mark = i;
                   }

                // isolate fragment
                if (mark < len && uri[mark] is '#')
                    fragment_ = decoder (uri[mark+1 .. len]);

                return this;
        }

        /***********************************************************************

                Clear everything to null.

        ***********************************************************************/

        final void reset()
        {
                decoded.reset();
                port_ = InvalidPort;
                host_ = path_ = query_ = scheme_ = userinfo_ = fragment_ = null;
        }

        /***********************************************************************

                Parse the given uri, with support for relative URLs

        ***********************************************************************/

        final Uri relParse (const(char)[] uri)
        {
                return parse (uri, true);
        }

        /***********************************************************************

                Set the Uri scheme

        ***********************************************************************/

        @property final Uri scheme (const(char)[] scheme)
        {
                this.scheme_ = scheme;
                return this;
        }

        /***********************************************************************

                Set the Uri host

        ***********************************************************************/

        @property final Uri host (const(char)[] host)
        {
                this.host_ = host;
                return this;
        }

        /***********************************************************************

                Set the Uri port

        ***********************************************************************/

        @property final Uri port (int port)
        {
                this.port_ = port;
                return this;
        }

        /***********************************************************************

                Set the Uri userinfo

        ***********************************************************************/

        @property final Uri userinfo (const(char)[] userinfo)
        {
                this.userinfo_ = userinfo;
                return this;
        }

        /***********************************************************************

                Set the Uri query

        ***********************************************************************/

        @property final Uri query (const(char)[] query)
        {
                this.query_ = query;
                return this;
        }

        /***********************************************************************

                Extend the Uri query. A non-empty tail is appended to the
                existing query with a '&' separator, or becomes the query
                when there is none. Returns the resulting query.

        ***********************************************************************/

        final const(char)[] extendQuery (const(char)[] tail)
        {
                if (tail.length)
                   {
                   if (query_.length)
                       query_ = query_ ~ "&" ~ tail;
                   else
                      query_ = tail;
                   }
                return query_;
        }

        /***********************************************************************

                Set the Uri path

        ***********************************************************************/

        @property final Uri path (const(char)[] path)
        {
                this.path_ = path;
                return this;
        }

        /***********************************************************************

                Set the Uri fragment

        ***********************************************************************/

        @property final Uri fragment (const(char)[] fragment)
        {
                this.fragment_ = fragment;
                return this;
        }

        /***********************************************************************

                Authority is the section after the scheme, but before the
                path, query or fragment; it typically represents a host.

                ---
                    ^(([^@]*)@?)([^:]*)?(:(.*))?
                     12         3       4 5

                2 isolates userinfo
                3 isolates host
                5 isolates port
                ---

        ***********************************************************************/

        private void parseAuthority (const(char)[] auth)
        {
                size_t  mark,
                        len = auth.length;

                // get userinfo: (([^@]*)@?)
                foreach (i, char c; auth)
                         if (c is '@')
                            {
                            userinfo_ = decoder (auth[0 .. i]);
                            mark = i + 1;
                            break;
                            }

                // get port: (:(.*))?
                for (size_t i=mark; i < len; ++i)
                     if (auth [i] is ':')
                        {
                        port_ = Integer.atoi (auth [i+1 .. len]);
                        len = i;
                        break;
                        }

                // get host: ([^:]*)?
                host_ = auth [mark..len];
        }

        /**********************************************************************

                Return the leading part of the path up to and including
                its last '/', or the whole path when it has no '/'.

        **********************************************************************/

        private final static T[] toLastSlash (T)(T[] path)
        {
                if (path.ptr)
                    for (auto p = path.ptr+path.length; --p >= path.ptr;)
                         if (*p is '/')
                             return path [0 .. (p-path.ptr)+1];
                return path;
        }

        /**********************************************************************

                in-place conversion to lowercase (ASCII letters only)

        **********************************************************************/

        private final static char[] toLower (ref char[] src)
        {
                foreach (ref char c; src)
                         if (c >= 'A' && c <= 'Z')
                             c = cast(char)(c + ('a' - 'A'));
                return src;
        }
}


/*******************************************************************************

        A growable buffer handing out consecutive slices; reset() makes
        the whole space available again without releasing it.

*******************************************************************************/

private struct HeapSlice
{
        private size_t  used;
        private void[]  buffer;

        /***********************************************************************

                Reset content length to zero

        ***********************************************************************/

        final void reset ()
        {
                used = 0;
        }

        /***********************************************************************

                Potentially expand the content space, and return a pointer
                to the start of the empty section.

        ***********************************************************************/

        final void* expand (size_t size)
        {
                auto len = used + size;
                if (len > buffer.length)
                    buffer.length = len + len/2;

                return &buffer [used];
        }

        /***********************************************************************

                Return a slice of the content from the current position
                with the specified size. Adjusts the current position to
                point at an empty zone.

        ***********************************************************************/

        final void[] slice (int size)
        {
                size_t i = used;
                used += size;
                return buffer [i..used];
        }
}

/*******************************************************************************

        Unittest

*******************************************************************************/

debug (UnitTest)
{
        import tango.util.log.Trace;

unittest
{
        auto uri = new Uri;
        auto uristring = "http://www.example.com/click.html/c=37571:RoS_Intern_search-link3_LB_Sky_Rec/b=98983:news-time-search-link_leader_neu/l=68%7C%7C%7C%7Cde/url=http://ads.ad4max.com/adclick.aspx?id=cf722624-efd5-4b10-ad53-88a5872a8873&pubad=b9c8acc4-e396-4b0b-b665-8bb3078128e6&avid=963171985&adcpc=xrH%2f%2bxVeFaPVkbVCMufB5A%3d%3d&a1v=6972657882&a1lang=de&a1ou=http%3a%2f%2fad.search.ch%2fiframe_ad.html%3fcampaignname%3dRoS_Intern_search-link3_LB_Sky_Rec%26bannername%3dnews-time-search-link_leader_neu%26iframeid%3dsl_if1%26content%3dvZLLbsIwEEX3%2bQo3aqUW1XEgkAckSJRuKqEuoDuELD%2bmiSEJyDEE%2fr7h0cKm6q6SF9bVjH3uzI3vMOaVYdpgPIwrodXGIHPYQGIb2BuyZDt2Vu2hRUh8Nx%2b%2fjj5Gc4u0UNAJ95H7jCbAJGi%2bZlqix3eoqyfUIhaT3YLtabpVEiXI5pEImRBdDF7k4y53Oea%2b38Mh554bhO1OCL49%2bO6qlTTZsa3546pmoNLMHOXIvaoapNIgTnpmzKZPCJNOBUyLzBEZEbkSKyczRU5E4gW9oN2frmf0rTSgS3quw7kqVx6dvNDZ6kCnIAhPojAKvX7Z%2bMFGFYBvKml%2bskxL2JI88cOHYPxzJJCtzpP79pXQaCZWqkxppcVvlDsF9b9CqiJNLiB1Xd%2bQqIKlUBHXSdWnjQbN1heLoRWTcwz%2bCAlqLCZXg5VzHoEj1gW5XJeVffOcFR8TCKVs8vcF%26crc%3dac8cc2fa9ec2e2de9d242345c2d40c25";

        uri.parse(uristring);
        with(uri)
        {
                assert(scheme == "http");
                assert(host == "www.example.com");
                assert(port == InvalidPort);
                assert(userinfo == null);
                assert(fragment == null);
                assert(path == "/click.html/c=37571:RoS_Intern_search-link3_LB_Sky_Rec/b=98983:news-time-search-link_leader_neu/l=68||||de/url=http://ads.ad4max.com/adclick.aspx");
                assert(query == "id=cf722624-efd5-4b10-ad53-88a5872a8873&pubad=b9c8acc4-e396-4b0b-b665-8bb3078128e6&avid=963171985&adcpc=xrH/+xVeFaPVkbVCMufB5A==&a1v=6972657882&a1lang=de&a1ou=http://ad.search.ch/iframe_ad.html?campaignname=RoS_Intern_search-link3_LB_Sky_Rec%26bannername=news-time-search-link_leader_neu%26iframeid=sl_if1%26content=vZLLbsIwEEX3+Qo3aqUW1XEgkAckSJRuKqEuoDuELD+miSEJyDEE/r7h0cKm6q6SF9bVjH3uzI3vMOaVYdpgPIwrodXGIHPYQGIb2BuyZDt2Vu2hRUh8Nx+/jj5Gc4u0UNAJ95H7jCbAJGi+Zlqix3eoqyfUIhaT3YLtabpVEiXI5pEImRBdDF7k4y53Oea+38Mh554bhO1OCL49+O6qlTTZsa3546pmoNLMHOXIvaoapNIgTnpmzKZPCJNOBUyLzBEZEbkSKyczRU5E4gW9oN2frmf0rTSgS3quw7kqVx6dvNDZ6kCnIAhPojAKvX7Z+MFGFYBvKml+skxL2JI88cOHYPxzJJCtzpP79pXQaCZWqkxppcVvlDsF9b9CqiJNLiB1Xd+QqIKlUBHXSdWnjQbN1heLoRWTcwz+CAlqLCZXg5VzHoEj1gW5XJeVffOcFR8TCKVs8vcF%26crc=ac8cc2fa9ec2e2de9d242345c2d40c25");

                parse("psyc://example.net/~marenz?what#_presence");
                assert(scheme == "psyc");
                assert(host == "example.net");
                assert(port == InvalidPort);
                assert(fragment == "_presence");
                assert(path == "/~marenz");
                assert(query == "what");

                // an empty uri yields an empty result
                parse("");
                assert(scheme == null);
                assert(host == null);
                assert(path == null);

                // malformed escapes are passed through untouched
                assert(decode("a%20b") == "a b");
                assert(decode("100%zz") == "100%zz");

                // scheme and userinfo character classes are distinct
                assert(encode("a;b", IncUser) == "a;b");
                assert(encode("a;b", IncScheme) == "a%3bb");
                assert(encode("a+b-c.d", IncScheme) == "a+b-c.d");
        }

        //Cout (uri).newline;
        //Cout (uri.encode ("&#$%", uri.IncQuery)).newline;
}
}