/*******************************************************************************

        Copyright:      Copyright (C) 2007 Aaron Craelius and Kris Bell
                        All rights reserved.

        License:        BSD style: $(LICENSE)

        version:        Initial release: February 2008

        Authors:        Aaron, Kris

*******************************************************************************/

module tango.text.xml.PullParser;

private import tango.text.Util : indexOf;

private import tango.core.Exception : XmlException;

private import Integer = tango.text.convert.Integer;

private import Utf = tango.text.convert.Utf : toString;

/*******************************************************************************

        Use -version=whitespace to retain whitespace as data nodes. We
        see a 25% increase in token count and a 10% throughput drop when
        parsing "hamlet.xml" with this option enabled (pullparser alone)

*******************************************************************************/

version (whitespace)
    version = retainwhite;
else
{
    version = stripwhite;
    version = partialwhite;
}

/*******************************************************************************

        The XML node types

*******************************************************************************/

public enum XmlNodeType {Element, Data, Attribute, CData,
                         Comment, PI, Doctype, Document}

/*******************************************************************************

        Values returned by the pull-parser

        Note that next() relies on the declaration order: every value
        below EndElement (Done, StartElement, Attribute) means "we may
        still be inside a start tag".

*******************************************************************************/

public enum XmlTokenType {Done, StartElement, Attribute, EndElement,
                          EndEmptyElement, Data, Comment, CData,
                          Doctype, PI, None}

/*******************************************************************************

        Token based xml Parser.  Templated to operate with char[], wchar[],
        and dchar[] content.

        The parser is constructed with some tradeoffs relating to document
        integrity. It is generally optimized for well-formed documents, and
        currently may read past a document-end for those that are not well
        formed. There are various compilation options to enable checks and
        balances, depending on how things should be handled.

        The per-character scanning loops (element names, attribute names
        and attribute values) are deliberately left without end-of-buffer
        tests for throughput. The one-off look-aheads used to classify
        "<!..." constructs and to find the "-->", "]]>" and "?>"
        terminators do check the remaining length before slicing.

        Settings are somewhat experimental: making some tiny unrelated
        change to the code can cause notable throughput changes. It is not
        yet clear why these swings are so pronounced (for changes outside
        the code path) but they seem to be related to the alignment of
        codegen. It could be a cache-line issue, or something else.

*******************************************************************************/

class PullParser(Ch = char)
{
    /// element nesting level; incremented when the '>' closing a start
    /// tag is consumed, decremented when an end tag is consumed
    public int                  depth;
    /// namespace prefix of the current element or attribute (may be null)
    public const(Ch)[]          prefix;
    /// unprocessed content of the current token (slice of the input)
    public const(Ch)[]          rawValue;
    /// name of the current element or attribute, without prefix
    public const(Ch)[]          localName;
    /// type of the most recently returned token
    public XmlTokenType         type = XmlTokenType.None;

    package XmlText!(Ch)        text;
    private bool                stream;
    private const(char)[]       errMsg;

    /***********************************************************************

            Construct a parser on the given content (may be null)

    ***********************************************************************/

    this(const(Ch[]) content = null)
    {
        reset (content);
    }

    /***********************************************************************

            Consume the next token and return its type

            Returns XmlTokenType.Done once the input is exhausted. An
            XmlException is thrown (via error) on a parse error, or on
            running out of input while depth is non-zero and streaming
            mode is off.

    ***********************************************************************/

    @property final XmlTokenType next()
    {
        auto e = text.end;
        auto p = text.point;

        // at end of document?
        if (p >= e)
            return endOfInput();

        version (stripwhite)
        {
            // strip leading whitespace
            while (*p <= 32)
                if (++p >= e)
                    return endOfInput();
        }

        // StartElement or Attribute?
        if (type < XmlTokenType.EndElement)
        {
            version (retainwhite)
            {
                // strip leading whitespace (thanks to DRK)
                while (*p <= 32)
                    if (++p >= e)
                        return endOfInput();
            }

            switch (*p)
            {
                case '>':
                    // termination of StartElement
                    ++depth;
                    ++p;
                    break;

                case '/':
                    // empty element closure
                    text.point = p;
                    return doEndEmptyElement();

                default:
                    // must be attribute instead
                    text.point = p;
                    return doAttributeName();
            }
        }

        // consume data between elements?
        if (*p != '<')
        {
            auto q = p;
            while (++p < e && *p != '<') {}

            if (p < e)
            {
                version (partialwhite)
                {
                    // include leading whitespace, but never step in
                    // front of the first character of the buffer
                    auto start = text.text.ptr;
                    while (q > start && *(q-1) <= 32)
                        --q;
                }
                text.point = p;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Data;
            }
            return endOfInput();
        }

        // must be a '<' character, so peek ahead
        switch (p[1])
        {
            case '!':
                // one of the following; only slice what is really there
                auto avail = e - p;
                if (avail >= 4 && p[2..4] == "--")
                {
                    text.point = p + 4;
                    return doComment();
                }
                else if (avail >= 9 && p[2..9] == "[CDATA[")
                {
                    text.point = p + 9;
                    return doCData();
                }
                else if (avail >= 9 && p[2..9] == "DOCTYPE")
                {
                    text.point = p + 9;
                    return doDoctype();
                }
                return doUnexpected("!", p);

            case '?':
                // must be PI data
                text.point = p + 2;
                return doPI();

            case '/':
                // should be a closing element name
                p += 2;
                auto q = p;
                while (*q > 63 || text.name[*q])
                    ++q;

                if (*q is ':')
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (*q > 63 || text.attributeName[*q])
                        ++q;

                    localName = p[0 .. q - p];
                }
                else
                {
                    prefix = null;
                    localName = p[0 .. q - p];
                }

                // whitespace is permitted between the name and '>'
                while (*q <= 32)
                    if (++q >= e)
                        return endOfInput();

                if (*q is '>')
                {
                    --depth;
                    text.point = q + 1;
                    return type = XmlTokenType.EndElement;
                }
                return doExpected(">", q);

            default:
                // scan new element name
                auto q = ++p;
                while (*q > 63 || text.name[*q])
                    ++q;

                // check if we ran past the end
                if (q >= e)
                    return endOfInput();

                if (*q != ':')
                {
                    prefix = null;
                    localName = p [0 .. q - p];
                }
                else
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (*q > 63 || text.attributeName[*q])
                        ++q;
                    localName = p[0 .. q - p];
                }

                // leave point on whatever follows the name; the next
                // call decides between attribute, '/' and '>'
                text.point = q;
                return type = XmlTokenType.StartElement;
        }
    }

    /***********************************************************************

            Parse one attribute (name, '=', quoted value) starting at
            the current point. Sets prefix, localName and rawValue; the
            value excludes the surrounding quotes.

    ***********************************************************************/

    private XmlTokenType doAttributeName()
    {
        auto p = text.point;
        auto q = p;
        auto e = text.end;

        while (*q > 63 || text.attributeName[*q])
            ++q;
        if (q >= e)
            return endOfInput();

        if (*q is ':')
        {
            prefix = p[0 .. q - p];
            p = ++q;

            while (*q > 63 || text.attributeName[*q])
                ++q;

            localName = p[0 .. q - p];
        }
        else
        {
            prefix = null;
            localName = p[0 .. q - p];
        }

        // optional whitespace before '='
        if (*q <= 32)
        {
            while (*++q <= 32) {}
            if (q >= e)
                return endOfInput();
        }

        if (*q is '=')
        {
            // optional whitespace after '='
            while (*++q <= 32) {}
            if (q >= e)
                return endOfInput();

            auto quote = *q;
            switch (quote)
            {
                case '"':
                case '\'':
                    p = q + 1;
                    // unchecked scan for the matching quote (see the
                    // class notes on malformed input)
                    while (*++q != quote) {}
                    if (q < e)
                    {
                        rawValue = p[0 .. q - p];
                        text.point = q + 1;   // skip end quote
                        return type = XmlTokenType.Attribute;
                    }
                    return endOfInput();

                default:
                    return doExpected("\' or \"", q);
            }
        }

        return doExpected ("=", q);
    }

    /***********************************************************************

            Consume the "/>" that closes an empty element. Clears
            prefix and localName.

    ***********************************************************************/

    private XmlTokenType doEndEmptyElement()
    {
        if (text.point[0] is '/' && text.point[1] is '>')
        {
            localName = prefix = null;
            text.point += 2;
            return type = XmlTokenType.EndEmptyElement;
        }
        return doExpected("/>", text.point);
    }

    /***********************************************************************

            Consume a comment body; point is just past "<!--". The
            rawValue is everything up to, but excluding, "-->".

    ***********************************************************************/

    private XmlTokenType doComment()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '-')
                if (++p >= e)
                    return endOfInput();

            // need three characters left before comparing them
            if (e - p >= 3 && p[0..3] == "-->")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Comment;
            }
            ++p;
        }

        return endOfInput();
    }

    /***********************************************************************

            Consume a CDATA section; point is just past "<![CDATA[".
            The rawValue is everything up to, but excluding, "]]>".

    ***********************************************************************/

    private XmlTokenType doCData()
    {
        auto e = text.end;
        auto p = text.point;

        // latch the start once: a ']' that is not part of "]]>" belongs
        // to the value and must not restart it
        auto q = p;

        while (p < e)
        {
            while (*p != ']')
                if (++p >= e)
                    return endOfInput();

            // need three characters left before comparing them
            if (e - p >= 3 && p[0..3] == "]]>")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.CData;
            }
            ++p;
        }

        return endOfInput();
    }

    /***********************************************************************

            Consume a processing instruction; point is just past "<?".
            The rawValue is everything up to, but excluding, "?>".

    ***********************************************************************/

    private XmlTokenType doPI()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '?')
                if (++p >= e)
                    return endOfInput();

            // only peek at the following character when there is one
            if (p + 1 < e && p[1] == '>')
            {
                rawValue = q [0 .. p - q];
                text.point = p + 2;
                return type = XmlTokenType.PI;
            }
            ++p;
        }
        return endOfInput();
    }

    /***********************************************************************

            Consume a doctype; point is just past "<!DOCTYPE". Leading
            whitespace is dropped and the rawValue runs up to the
            closing '>'. A '[' ... ']' section is skipped as a whole,
            so a '>' inside it does not terminate the doctype.

    ***********************************************************************/

    private XmlTokenType doDoctype()
    {
        auto e = text.end;
        auto p = text.point;

        // "<!DOCTYPE" may be the very last thing in the buffer
        if (p >= e)
            return endOfInput();

        // strip leading whitespace
        while (*p <= 32)
            if (++p >= e)
                return endOfInput();

        auto q = p;
        while (p < e)
        {
            if (*p is '>')
            {
                rawValue = q [0 .. p - q];
                prefix = null;
                text.point = p + 1;
                return type = XmlTokenType.Doctype;
            }
            else
            {
                if (*p == '[')
                    do {
                        if (++p >= e)
                            return endOfInput();
                    } while (*p != ']');
                ++p;
            }
        }

        // the loop is left only when the input ran out
        return endOfInput();
    }

    /***********************************************************************

            Called when the input is exhausted. Raises "Unexpected EOF"
            when elements are still open (depth != 0) and streaming
            mode is off; otherwise returns XmlTokenType.Done.

    ***********************************************************************/

    private XmlTokenType endOfInput ()
    {
        if (depth && (stream is false))
            error ("Unexpected EOF");

        return XmlTokenType.Done;
    }

    /***********************************************************************

            Raise a "parse error :: unexpected ..." error for position p

    ***********************************************************************/

    private XmlTokenType doUnexpected (const(char[]) msg, const(Ch)* p)
    {
        return position ("parse error :: unexpected " ~ msg, p);
    }

    /***********************************************************************

            Raise a "parse error :: expected ... instead of ..." error,
            quoting the character found at p

    ***********************************************************************/

    private XmlTokenType doExpected (const(char[]) msg, const(Ch)* p)
    {
        char[6] tmp = void;
        return position ("parse error :: expected " ~ msg ~ " instead of "
                         ~ Utf.toString(p[0..1], tmp), p);
    }

    /***********************************************************************

            Append the offset of p (relative to the start of the
            content) to msg and raise it as an error

    ***********************************************************************/

    private XmlTokenType position (const(char[]) msg, const(Ch)* p)
    {
        return error (msg ~ " at position "
                      ~ Integer.toString(p-text.text.ptr));
    }

    /***********************************************************************

            Record msg as the last error and throw it as an
            XmlException. Never returns normally; the return type only
            allows "return error(...)" at the call sites.

    ***********************************************************************/

    @property protected final XmlTokenType error (const(char[]) msg)
    {
        errMsg = msg;
        throw new XmlException (msg.idup);
    }

    /***********************************************************************

            Return the raw value of the current token

    ***********************************************************************/

    @property final const const(Ch[]) value()
    {
        return rawValue;
    }

    /***********************************************************************

            Return the name of the current token

            This is localName, or "prefix:localName" (a freshly
            concatenated array) when a prefix is present.

    ***********************************************************************/

    @property final const const(Ch[]) name()
    {
        if (prefix.length)
            return prefix ~ ":" ~ localName;
        return localName;
    }

    /***********************************************************************

            Returns the text of the last error

    ***********************************************************************/

    @property final const const(char[]) error()
    {
        return errMsg;
    }

    /***********************************************************************

            Reset the parser to the start of the current content.
            Always returns true.

    ***********************************************************************/

    final bool reset()
    {
        text.reset (text.text);
        reset_();
        return true;
    }

    /***********************************************************************

            Reset parser with new content

    ***********************************************************************/

    final void reset(const(Ch[]) newText)
    {
        text.reset (newText);
        reset_();
    }

    /***********************************************************************

            experimental: set streaming mode

            While enabled, running out of input with open elements is
            reported as XmlTokenType.Done instead of an error.

            Use at your own risk, may be removed.

    ***********************************************************************/

    final void incremental (bool yes = true)
    {
        stream = yes;
    }

    /***********************************************************************

            Clear the parse state and position point after an optional
            UTF-8 BOM (char content only), leading whitespace and an
            optional "<?xml ... ?>" declaration. The declaration is
            skipped, not tokenized.

    ***********************************************************************/

    private void reset_()
    {
        depth = 0;
        errMsg = null;
        type = XmlTokenType.None;

        auto p = text.point;
        if (p)
        {
            auto e = text.end;

            static if (Ch.sizeof == 1)
            {
                // consume UTF8 BOM, provided three bytes are present
                if (e - p >= 3
                    && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                    p += 3;
            }

            //TODO enable optional declaration parsing
            while (p < e && *p <= 32)
                ++p;

            // "<?xml" needs five characters; do not probe a shorter tail
            if (e - p >= 5
                && p[0] is '<' && p[1] is '?' && p[2..5] == "xml")
            {
                p += 5;
                while (p < e && *p != '?')
                    ++p;

                // step over "?>" but keep point within the buffer when
                // the declaration is truncated
                p = (e - p >= 2) ? p + 2 : e;
            }
            text.point = p;
        }
    }
}

/*******************************************************************************

        The content being parsed plus the current read position, and
        the character-class tables used when scanning names.

*******************************************************************************/

package struct XmlText(Ch)
{
    package const(Ch)*      end;    // one past the last character
    package size_t          len;    // length of text
    package const(Ch)[]     text;   // the complete content
    package const(Ch)*      point;  // current read position

    /// Install new content and rewind point to its first character
    final void reset(const(Ch[]) newText)
    {
        this.text = newText;
        this.len = newText.length;
        this.point = text.ptr;
        this.end = point + len;
    }

    // Lookup tables for code points 0 .. 63: a non-zero entry means the
    // character continues a name. Callers treat every code point above
    // 63 as a name character without consulting the table.

    static immutable ubyte[64] name =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0   // 3
    ];

    static immutable ubyte[64] attributeName =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0   // 3
    ];
}

/*******************************************************************************

        Unit tests (enabled with -debug=UnitTest)

*******************************************************************************/

debug (UnitTest)
{
    /***********************************************************************

            Walk the parser over testXML and check every token

    ***********************************************************************/

    void testParser(Ch)(PullParser!(Ch) itr)
    {
        /*
        assert(itr.next);
        assert(itr.value == "");
        assert(itr.type == XmlTokenType.Declaration, Integer.toString(itr.type));
        assert(itr.next);
        assert(itr.value == "version");
        assert(itr.next);
        assert(itr.value == "1.0");
        */
        assert(itr.next);
        assert(itr.value == "element [ <!ELEMENT element (#PCDATA)>]");
        assert(itr.type == XmlTokenType.Doctype);
        assert(itr.next);
        assert(itr.localName == "element");
        assert(itr.type == XmlTokenType.StartElement);
        assert(itr.depth == 0);
        assert(itr.next);
        assert(itr.localName == "attr");
        assert(itr.value == "1");
        assert(itr.next);
        assert(itr.type == XmlTokenType.Attribute);
        assert(itr.localName == "attr2");
        assert(itr.value == "two");
        assert(itr.next);
        assert(itr.value == "comment");
        assert(itr.next);
        assert(itr.rawValue == "test&Z");
        assert(itr.next);
        assert(itr.prefix == "qual");
        assert(itr.localName == "elem");
        assert(itr.next);
        assert(itr.type == XmlTokenType.EndEmptyElement);
        assert(itr.next);
        assert(itr.localName == "el2");
        assert(itr.depth == 1);
        assert(itr.next);
        assert(itr.localName == "attr3");
        assert(itr.value == "3three", itr.value);
        assert(itr.next);
        assert(itr.rawValue == "sdlgjsh");
        assert(itr.next);
        assert(itr.localName == "el3");
        assert(itr.depth == 2);
        assert(itr.next);
        assert(itr.type == XmlTokenType.EndEmptyElement);
        assert(itr.next);
        assert(itr.value == "data");
        assert(itr.next);
        // assert(itr.qvalue == "pi", itr.qvalue);
        // assert(itr.value == "test");
        assert(itr.rawValue == "pi test", itr.rawValue);
        assert(itr.next);
        assert(itr.localName == "el2");
        assert(itr.next);
        assert(itr.localName == "element");
        assert(!itr.next);
    }

    /***********************************************************************

            Sample document exercised by testParser

    ***********************************************************************/

    enum immutable(char)[] testXML =
        "<?xml version=\"1.0\" ?><!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]><element "
        ~ "attr=\"1\" attr2=\"two\"><!--comment-->test&Z<qual:elem /><el2 attr3 = "
        ~ "'3three'><![CDATA[sdlgjsh]]><el3 />data<?pi test?></el2></element>";

    unittest
    {
        auto itr = new PullParser!(char)(testXML);
        testParser (itr);

        // a lone ']' inside CDATA belongs to the value
        auto cd = new PullParser!(char)("<r><![CDATA[a]b]]></r>");
        assert(cd.next == XmlTokenType.StartElement);
        assert(cd.next == XmlTokenType.CData);
        assert(cd.value == "a]b");
        assert(cd.next == XmlTokenType.EndElement);
        assert(cd.next == XmlTokenType.Done);

        // data at the very start of the buffer keeps its leading
        // whitespace without looking in front of the buffer
        auto lead = new PullParser!(char)("  hi<a/>");
        assert(lead.next == XmlTokenType.Data);
        assert(lead.value == "  hi");

        // content shorter than a BOM or an xml declaration
        auto tiny = new PullParser!(char)("a");
        assert(tiny.next == XmlTokenType.Done);
    }
}