Jonathan M Davis: The Long-Winded D Guy

dxml.parser

This implements a range-based StAX parser for XML 1.0 (which will work with XML 1.1 documents assuming that they don't use any 1.1-specific features). For the sake of simplicity, sanity, and efficiency, the DTD section is not supported beyond what is required to parse past it.
Start tags, end tags, comments, cdata sections, and processing instructions are all supported and reported to the application. Anything in the DTD is skipped (though it's parsed enough to parse past it correctly, and that can result in an XMLParsingException if that XML isn't valid enough to be correctly skipped), and the XML declaration at the top is skipped if present (XML 1.1 requires that it be there, but XML 1.0 does not).
Regardless of what the XML declaration says (if present), any range of char will be treated as being encoded in UTF-8, any range of wchar will be treated as being encoded in UTF-16, and any range of dchar will be treated as having been encoded in UTF-32. Strings will be treated as ranges of their code units, not code points. Note that like Phobos typically does when processing strings, the code assumes that BOMs have already been removed, so if the range of characters comes from a file that uses a BOM, the calling code needs to strip it out before calling parseXML, or parsing will fail due to invalid characters.
Since the DTD is skipped, entity references other than the five which are predefined by the XML spec cannot be fully processed (since wherever they were used in the document would be replaced by what they referred to, which could be arbitrarily complex XML). As such, by default, if any entity references which are not predefined are encountered outside of the DTD, an XMLParsingException will be thrown (see Config.throwOnEntityRef for how that can be configured). The predefined entity references and any character references encountered will be checked to verify that they're valid, but they will not be replaced (since that does not work with returning slices of the original input).
However, decodeXML or parseStdEntityRef from dxml.util can be used to convert the predefined entity references to what they refer to, and decodeXML or parseCharRef from dxml.util can be used to convert character references to what they refer to.

Primary Symbols

Symbol Description
parseXML The function used to initiate the parsing of an XML document.
EntityRange The range returned by parseXML.
EntityRange.Entity The element type of EntityRange.

Parser Configuration Helpers

Symbol Description
Config Used to configure how EntityRange parses the XML.
simpleXML A user-friendly configuration for when the application just wants the element tags and the data in between them.
makeConfig A convenience function for constructing a custom Config.
SkipComments A std.typecons.Flag used with Config to tell the parser to skip comments.
SkipPI A std.typecons.Flag used with Config to tell the parser to skip processing instructions.
SplitEmpty A std.typecons.Flag used with Config to configure how the parser deals with empty element tags.

Helper Types Used When Parsing

Symbol Description
EntityType The type of an entity in the XML (e.g. a start tag or a comment).
TextPos Gives the line and column number in the XML document.
XMLParsingException Thrown by EntityRange when it encounters invalid XML.

Helper Functions Used When Parsing

Symbol Description
getAttrs A function similar to std.getopt.getopt which allows for the easy processing of start tag attributes.
skipContents Iterates an EntityRange from a start tag to its matching end tag.
skipToPath Used to navigate from one start tag to another as if the start tag names formed a file path.
skipToEntityType Skips to the next entity of the given type in the range.
skipToParentEndTag Iterates an EntityRange until it reaches the end tag that matches the start tag which is the parent of the current entity.

Helper Traits

Symbol Description
isAttrRange Whether the given range is a range of attributes.
Examples:
auto xml = "<!-- comment -->\n" ~
           "<root>\n" ~
           "    <foo>some text<whatever/></foo>\n" ~
           "    <bar/>\n" ~
           "    <baz></baz>\n" ~
           "</root>";
{
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.comment);
    assert(range.front.text == " comment ");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");
    range.popFront();

    assert(range.front.type == EntityType.text);
    assert(range.front.text == "some text");
    range.popFront();

    assert(range.front.type == EntityType.elementEmpty);
    assert(range.front.name == "whatever");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "foo");
    range.popFront();

    assert(range.front.type == EntityType.elementEmpty);
    assert(range.front.name == "bar");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "baz");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "baz");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");
    range.popFront();

    assert(range.empty);
}
{
    auto range = parseXML!simpleXML(xml);

    // simpleXML skips comments

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");
    range.popFront();

    assert(range.front.type == EntityType.text);
    assert(range.front.text == "some text");
    range.popFront();

    // simpleXML splits empty element tags into a start tag and end tag
    // so that the code doesn't have to care whether a start tag with no
    // content is an empty tag or a start tag and end tag with nothing but
    // whitespace in between.
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "whatever");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "whatever");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "foo");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "bar");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "bar");
    range.popFront();

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "baz");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "baz");
    range.popFront();

    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");
    range.popFront();

    assert(range.empty);
}
class XMLParsingException: object.Exception;
The exception type thrown when the XML parser encounters invalid XML.
TextPos pos;
The position in the XML input where the problem is.
struct TextPos;
Where in the XML document an entity is.
The line and column numbers are 1-based.
The primary use case for TextPos is XMLParsingException, but an application may have other uses for it. The TextPos for an Entity can be obtained from Entity.pos.
int line;
A line number in the XML file.
int col;
A column number in a line of the XML file.
Each code unit is considered a column, so depending on what a program is looking to do with the column number, it may need to examine the actual text on that line and calculate the number that represents what the program wants to display (e.g. the number of graphemes).
struct Config;
Used to configure how the parser works.
Flag skipComments;
Whether the comments should be skipped while parsing.
If skipComments == SkipComments.yes, any entities of type EntityType.comment will be omitted from the parsing results, and they will not be validated beyond what is required to parse past them.
Defaults to SkipComments.no.
Flag skipPI;
Whether processing instructions should be skipped.
If skipPI == SkipPI.yes, any entities of type EntityType.pi will be skipped, and they will not be validated beyond what is required to parse past them.
Defaults to SkipPI.no.
Flag splitEmpty;
Whether the parser should report empty element tags as if they were a start tag followed by an end tag with nothing in between.
If splitEmpty == SplitEmpty.yes, then whenever an EntityType.elementEmpty is encountered, the parser will claim that that entity is an EntityType.elementStart, and then it will provide an EntityType.elementEnd as the next entity before the entity that actually follows it.
The purpose of this is to simplify the code using the parser, since most code does not care about the difference between an empty tag and a start and end tag with nothing in between. But since some code may care about the difference, the behavior is configurable.
Defaults to SplitEmpty.no.
Examples:
enum configSplitYes = makeConfig(SplitEmpty.yes);

{
    auto range = parseXML("<root></root>");
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.empty);
}
{
    // No difference if the tags are already split.
    auto range = parseXML!configSplitYes("<root></root>");
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.empty);
}
{
    // This treats <root></root> and <root/> as distinct.
    auto range = parseXML("<root/>");
    assert(range.front.type == EntityType.elementEmpty);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.empty);
}
{
    // This is parsed as if it were <root></root> instead of <root/>.
    auto range = parseXML!configSplitYes("<root/>");
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");
    range.popFront();
    assert(range.empty);
}
Flag throwOnEntityRef;
Whether the parser should throw when it encounters any entity references other than the five entity references defined in the XML standard.
Any other entity references would have to be defined in the DTD in order to be valid. And in order to know what XML they represent (which could be arbitrarily complex, even effectively inserting entire XML documents into the middle of the XML), the DTD would have to be parsed. However, dxml does not support parsing the DTD beyond what is required to correctly parse past it, and replacing entity references with what they represent would not work with the slicing semantics that EntityRange provides. As such, it is not possible for dxml to correctly handle any entity references other than the five which are defined in the XML standard, and even those are only parsed by using dxml.util.decodeXML or dxml.util.parseStdEntityRef. EntityRange always validates that entity references are one of the five, predefined entity references, but otherwise, it lets them pass through as normal text. It does not replace them with what they represent.
As such, the default behavior of EntityRange is to throw an XMLParsingException when it encounters an entity reference which is not one of the five defined by the XML standard. With that behavior, there is no risk of processing an XML document as if it had no entity references and ending up with what the program using the parser would probably consider incorrect results. However, there are cases where a program may find it acceptable to treat entity references as normal text and ignore them. As such, if a program wishes to take that approach, it can set throwOnEntityRef to ThrowOnEntityRef.no.
If throwOnEntityRef == ThrowOnEntityRef.no, then any entity reference that it encounters will be validated to ensure that it is syntactically valid (i.e. that the characters it contains form what could be a valid entity reference assuming that the DTD declared it properly), but otherwise, EntityRange will treat it as normal text, just like it treats the five, predefined entity references as normal text.
Note that any valid XML entity reference which contains start or end tags must contain matching start or end tags, and entity references cannot contain incomplete fragments of XML (e.g. the start or end of a comment). So, missing entity references should only affect the data in the XML document and not its overall structure (if that were not true, attempting to ignore entity references the way that ThrowOnEntityRef.no does would be a disaster in the making). However, how reasonable it is to miss that data depends entirely on the application and what the XML documents it's parsing contain - hence, the behavior is configurable.
Examples:
import std.exception : assertThrown;
import dxml.util : decodeXML;

auto xml = "<root>\n" ~
           "    <std>&amp;&apos;&gt;&lt;&quot;</std>\n" ~
           "    <other>&foobar;</other>\n" ~
           "    <invalid>&--;</invalid>\n" ~
           "</root>";

// ThrowOnEntityRef.yes
{
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "std");

    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text == "&amp;&apos;&gt;&lt;&quot;");
    assert(range.front.text.decodeXML() == `&'><"`);

    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "std");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "other");

    // Attempted to parse past "&foobar;", which is syntactically
    // valid, but it's not one of the five predefined entity references.
    assertThrown!XMLParsingException(range.popFront());
}

// ThrowOnEntityRef.no
{
    auto range = parseXML!(makeConfig(ThrowOnEntityRef.no))(xml);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "std");

    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text == "&amp;&apos;&gt;&lt;&quot;");
    assert(range.front.text.decodeXML() == `&'><"`);

    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "std");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "other");

    // Doesn't throw, because "&foobar;" is syntactically valid.
    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text == "&foobar;");

    // decodeXML has no effect on non-standard entity references.
    assert(range.front.text.decodeXML() == "&foobar;");

    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "other");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "invalid");

    // Attempted to parse past "&--;", which is not syntactically valid,
    // because -- is not a valid name for an entity reference.
    assertThrown!XMLParsingException(range.popFront());
}
alias SkipComments = std.typecons.Flag!"SkipComments".Flag;
See Also: skipComments
alias SkipPI = std.typecons.Flag!"SkipPI".Flag;
See Also: skipPI
alias SplitEmpty = std.typecons.Flag!"SplitEmpty".Flag;
See Also: splitEmpty
alias ThrowOnEntityRef = std.typecons.Flag!"ThrowOnEntityRef".Flag;
Config makeConfig(Args...)(Args args);
Helper function for creating a custom config. It makes it easy to set one or more of the member variables to something other than the default without having to worry about explicitly setting them individually or setting them all at once via a constructor.
The order of the arguments does not matter. The types of each of the members of Config are unique, so that information alone is sufficient to determine which argument should be assigned to which member.
Examples:
{
    auto config = makeConfig(SkipComments.yes);
    assert(config.skipComments == SkipComments.yes);
    assert(config.skipPI == Config.init.skipPI);
    assert(config.splitEmpty == Config.init.splitEmpty);
    assert(config.throwOnEntityRef == Config.init.throwOnEntityRef);
}
{
    auto config = makeConfig(SkipComments.yes, SkipPI.yes);
    assert(config.skipComments == SkipComments.yes);
    assert(config.skipPI == SkipPI.yes);
    assert(config.splitEmpty == Config.init.splitEmpty);
    assert(config.throwOnEntityRef == Config.init.throwOnEntityRef);
}
{
    auto config = makeConfig(SplitEmpty.yes, SkipComments.yes, ThrowOnEntityRef.no);
    assert(config.skipComments == SkipComments.yes);
    assert(config.skipPI == Config.init.skipPI);
    assert(config.splitEmpty == SplitEmpty.yes);
    assert(config.throwOnEntityRef == ThrowOnEntityRef.no);
}
enum Config simpleXML;
This Config is intended for making it easy to parse XML by skipping everything that isn't the actual data as well as making it simpler to deal with empty element tags by treating them the same as a start tag and end tag with nothing but whitespace between them.
Examples:
static assert(simpleXML.skipComments == SkipComments.yes);
static assert(simpleXML.skipPI == SkipPI.yes);
static assert(simpleXML.splitEmpty == SplitEmpty.yes);
static assert(simpleXML.throwOnEntityRef == ThrowOnEntityRef.yes);
enum EntityType: int;
Represents the type of an XML entity. Used by EntityRange.Entity.
cdata
A cdata section: <![CDATA[ ... ]]>.
comment
An XML comment: <!-- ... -->.
elementStart
The start tag for an element. e.g. <foo name="value">.
elementEnd
The end tag for an element. e.g. </foo>.
elementEmpty
The tag for an element with no contents or matching end tag. e.g. <foo name="value"/>.
pi
A processing instruction such as <?foo?>. Note that the <?xml ... ?> is skipped and not treated as an EntityType.pi.
text
The content of an element tag that is simple text.
If there is an entity other than the end tag following the text, then the text includes up to that entity.
Note however that character references (e.g. "&#42;") and the predefined entity references (e.g. "&apos;") are left unprocessed in the text. In order for them to be processed, the text should be passed to either decodeXML or asDecodedXML. Entity references which are not predefined are considered invalid XML, because the DTD section is skipped, and thus they cannot be processed properly.
struct EntityRange(Config cfg, R) if (isForwardRange!R && isSomeChar!(ElementType!R));
EntityRange!(config, R) parseXML(Config config = Config.init, R)(R xmlText)
if(isForwardRange!R && isSomeChar!(ElementType!R));
Lazily parses the given range of characters as an XML document.
EntityRange is essentially a StAX parser, though it evolved into that rather than being based on what Java did, and it's range-based rather than iterator-based, so its API is likely to differ from other implementations. The basic concept should be the same though.
One of the core design goals of this parser is to slice the original input rather than having to allocate strings for the output or wrap it in a lazy range that produces a mutated version of the data. So, all of the text that the parser provides is either a slice or std.range.takeExactly of the input. However, in some cases, for the parser to be fully compliant with the XML spec, dxml.util.decodeXML must be called on the text to mutate certain constructs (e.g. removing any '\r' in the text or converting "&lt;" to '<'). But that's left up to the application.
The parser is not @nogc, but it allocates memory very minimally. It allocates some of its state on the heap so it can validate attributes and end tags. However, that state is shared among all the ranges that came from the same call to parseXML (only the range farthest along in parsing validates attributes or end tags), so save does not allocate memory unless save on the underlying range allocates memory. The shared state currently uses a couple of dynamic arrays to validate the tags and attributes, and if the document has a particularly deep tag depth or has a lot of attributes on a start tag, then some reallocations may occur until the maximum is reached, but enough is reserved that for most documents, no reallocations will occur. The only other times that the parser would allocate would be if an exception were thrown or if the range that was passed to parseXML allocates for any reason when calling any of the range primitives.
If invalid XML is encountered at any point during the parsing process, an XMLParsingException will be thrown. If an exception has been thrown, then the parser is in an invalid state, and it is an error to call any functions on it.
However, note that XML validation is reduced for any entities that are skipped (e.g. for anything in the DTD, validation is reduced to what is required to correctly parse past it, and when Config.skipPI == SkipPI.yes, processing instructions are only validated enough to correctly skip past them).
As the module documentation says, this parser does not provide any DTD support. It is not possible to properly support the DTD while returning slices of the original input, and the DTD portion of the spec makes parsing XML far, far more complicated.
A quick note about carriage returns: per the XML spec, they are all supposed to either be stripped out or replaced with newlines or spaces before the XML parser even processes the text. That doesn't work when the parser is slicing the original text and not mutating it at all. So, for the purposes of parsing, this parser treats all carriage returns as if they were newlines or spaces (though they won't count as newlines when counting the lines for TextPos). However, they will appear in any text fields or attribute values if they are in the document (since the text fields and attribute values are slices of the original text). dxml.util.decodeXML can be used to strip them along with converting any character references in the text. Alternatively, the application can remove them all before calling parseXML, but it's not necessary.
Examples:
import std.range.primitives : walkLength;

auto xml = "<?xml version='1.0'?>\n" ~
           "<?instruction start?>\n" ~
           "<foo attr='42'>\n" ~
           "    <bar/>\n" ~
           "    <!-- no comment -->\n" ~
           "    <baz hello='world'>\n" ~
           "    nothing to say.\n" ~
           "    nothing at all...\n" ~
           "    </baz>\n" ~
           "</foo>\n" ~
           "<?some foo?>";

{
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.pi);
    assert(range.front.name == "instruction");
    assert(range.front.text == "start");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    {
        auto attrs = range.front.attributes;
        assert(walkLength(attrs.save) == 1);
        assert(attrs.front.name == "attr");
        assert(attrs.front.value == "42");
    }

    range.popFront();
    assert(range.front.type == EntityType.elementEmpty);
    assert(range.front.name == "bar");

    range.popFront();
    assert(range.front.type == EntityType.comment);
    assert(range.front.text == " no comment ");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "baz");

    {
        auto attrs = range.front.attributes;
        assert(walkLength(attrs.save) == 1);
        assert(attrs.front.name == "hello");
        assert(attrs.front.value == "world");
    }

    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text ==
           "\n    nothing to say.\n    nothing at all...\n    ");

    range.popFront();
    assert(range.front.type == EntityType.elementEnd); // </baz>
    range.popFront();
    assert(range.front.type == EntityType.elementEnd); // </foo>

    range.popFront();
    assert(range.front.type == EntityType.pi);
    assert(range.front.name == "some");
    assert(range.front.text == "foo");

    range.popFront();
    assert(range.empty);
}
{
    auto range = parseXML!simpleXML(xml);

    // simpleXML is set to skip processing instructions.

    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    {
        auto attrs = range.front.attributes;
        assert(walkLength(attrs.save) == 1);
        assert(attrs.front.name == "attr");
        assert(attrs.front.value == "42");
    }

    // simpleXML is set to split empty tags so that <bar/> is treated
    // as the same as <bar></bar> so that code does not have to
    // explicitly handle empty tags.
    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "bar");
    range.popFront();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "bar");

    // simpleXML is set to skip comments.

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "baz");

    {
        auto attrs = range.front.attributes;
        assert(walkLength(attrs.save) == 1);
        assert(attrs.front.name == "hello");
        assert(attrs.front.value == "world");
    }

    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text ==
           "\n    nothing to say.\n    nothing at all...\n    ");

    range.popFront();
    assert(range.front.type == EntityType.elementEnd); // </baz>
    range.popFront();
    assert(range.front.type == EntityType.elementEnd); // </foo>
    range.popFront();
    assert(range.empty);
}
alias config = cfg;
The Config used when parsing the XML.
alias Input = R;
The type of the range that EntityRange is parsing.
alias SliceOfR = R;
The type used when any slice of the original input is used. If R is a string or supports slicing, then SliceOfR is the same as R; otherwise, it's the result of calling std.range.takeExactly on the input.
import std.algorithm : filter;
import std.range : takeExactly;

static assert(is(EntityRange!(Config.init, string).SliceOfR == string));

auto range = filter!(a => true)("some xml");

static assert(is(EntityRange!(Config.init, typeof(range)).SliceOfR ==
                 typeof(takeExactly(range, 42))));
struct Entity;
Represents an entity in the XML document.
Note that the type determines which properties can be used, and it can determine whether functions which an Entity or EntityRange is passed to are allowed to be called. Each function lists which EntityTypes are allowed, and it is an error to call them with any other EntityType.
alias Attribute = Tuple!(SliceOfR, "name", SliceOfR, "value", TextPos, "pos");
The exact instantiation of std.typecons.Tuple that attributes returns a range of.
See Also: attributes
const pure nothrow @nogc @property @safe EntityType type();
The EntityType for this Entity.
Examples:
auto xml = "<root>\n" ~
           "    <!--no comment-->\n" ~
           "    <![CDATA[cdata run]]>\n" ~
           "    <text>I am text!</text>\n" ~
           "    <empty/>\n" ~
           "    <?pi?>\n" ~
           "</root>";

auto range = parseXML(xml);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "root");
range.popFront();

assert(range.front.type == EntityType.comment);
assert(range.front.text == "no comment");
range.popFront();

assert(range.front.type == EntityType.cdata);
assert(range.front.text == "cdata run");
range.popFront();

assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "text");
range.popFront();

assert(range.front.type == EntityType.text);
assert(range.front.text == "I am text!");
range.popFront();

assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "text");
range.popFront();

assert(range.front.type == EntityType.elementEmpty);
assert(range.front.name == "empty");
range.popFront();

assert(range.front.type == EntityType.pi);
assert(range.front.name == "pi");
range.popFront();

assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "root");
range.popFront();

assert(range.empty);
const pure nothrow @nogc @property @safe TextPos pos();
The position in the original text where the entity starts.
Examples:
auto xml = "<root>\n" ~
           "    <foo>\n" ~
           "        Foo and bar. Always foo and bar...\n" ~
           "    </foo>\n" ~
           "</root>";

auto range = parseXML(xml);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "root");
assert(range.front.pos == TextPos(1, 1));
range.popFront();

assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "foo");
assert(range.front.pos == TextPos(2, 5));
range.popFront();

assert(range.front.type == EntityType.text);
assert(range.front.text ==
       "\n" ~
       "        Foo and bar. Always foo and bar...\n" ~
       "    ");
assert(range.front.pos == TextPos(2, 10));
range.popFront();

assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "foo");
assert(range.front.pos == TextPos(4, 5));
range.popFront();

assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "root");
assert(range.front.pos == TextPos(5, 1));
range.popFront();

assert(range.empty);
@property SliceOfR name();
Gives the name of this Entity.
Note that this is the direct name in the XML for this entity and does not contain any of the names of any of the parent entities that this entity has. If an application wants the full "path" of the entity, then it will have to keep track of that itself. The parser does not do that as it would require allocating memory.
Supported EntityTypes:
elementStart
elementEnd
elementEmpty
pi
Examples:
auto xml = "<root>\n" ~
           "    <empty/>\n" ~
           "    <?pi?>\n" ~
           "</root>";

auto range = parseXML(xml);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "root");
range.popFront();

assert(range.front.type == EntityType.elementEmpty);
assert(range.front.name == "empty");
range.popFront();

assert(range.front.type == EntityType.pi);
assert(range.front.name == "pi");
range.popFront();

assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "root");
range.popFront();

assert(range.empty);
@property auto attributes();
Returns a lazy range of attributes for a start tag where each attribute is represented as a
Tuple!( SliceOfR, "name", SliceOfR, "value", TextPos, "pos").
Examples:
import std.algorithm.comparison : equal;
import std.algorithm.iteration : filter;
{
    auto xml = "<root/>";
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementEmpty);
    assert(range.front.attributes.empty);

    static assert(is(ElementType!(typeof(range.front.attributes)) ==
                     typeof(range).Entity.Attribute));
}
{
    auto xml = "<root a='42' q='29' w='hello'/>";
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementEmpty);

    auto attrs = range.front.attributes;
    assert(attrs.front.name == "a");
    assert(attrs.front.value == "42");
    assert(attrs.front.pos == TextPos(1, 7));
    attrs.popFront();

    assert(attrs.front.name == "q");
    assert(attrs.front.value == "29");
    assert(attrs.front.pos == TextPos(1, 14));
    attrs.popFront();

    assert(attrs.front.name == "w");
    assert(attrs.front.value == "hello");
    assert(attrs.front.pos == TextPos(1, 21));
    attrs.popFront();

    assert(attrs.empty);
}
// Because the type of name and value is SliceOfR, == with a string
// only works if the range passed to parseXML was string.
{
    auto xml = filter!(a => true)("<root a='42' q='29' w='hello'/>");
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementEmpty);

    auto attrs = range.front.attributes;
    assert(equal(attrs.front.name, "a"));
    assert(equal(attrs.front.value, "42"));
    assert(attrs.front.pos == TextPos(1, 7));
    attrs.popFront();

    assert(equal(attrs.front.name, "q"));
    assert(equal(attrs.front.value, "29"));
    assert(attrs.front.pos == TextPos(1, 14));
    attrs.popFront();

    assert(equal(attrs.front.name, "w"));
    assert(equal(attrs.front.value, "hello"));
    assert(attrs.front.pos == TextPos(1, 21));
    attrs.popFront();

    assert(attrs.empty);
}
@property SliceOfR text();
Returns the textual value of this Entity.
In the case of EntityType.pi, this is the text that follows the name, whereas in the other cases, the text is the entire contents of the entity (save for the delimiters on the ends if that entity has them).
Supported EntityTypes:
cdata
comment
pi
text
Examples:
import std.range.primitives : empty;

auto xml = "<?xml version='1.0'?>\n" ~
           "<?instructionName?>\n" ~
           "<?foo here is something to say?>\n" ~
           "<root>\n" ~
           "    <![CDATA[ Yay! random text >> << ]]>\n" ~
           "    <!-- some random comment -->\n" ~
           "    <p>something here</p>\n" ~
           "    <p>\n" ~
           "       something else\n" ~
           "       here</p>\n" ~
           "</root>";
auto range = parseXML(xml);

// "<?instructionName?>\n" ~
assert(range.front.type == EntityType.pi);
assert(range.front.name == "instructionName");
assert(range.front.text.empty);

// "<?foo here is something to say?>\n" ~
range.popFront();
assert(range.front.type == EntityType.pi);
assert(range.front.name == "foo");
assert(range.front.text == "here is something to say");

// "<root>\n" ~
range.popFront();
assert(range.front.type == EntityType.elementStart);

// "    <![CDATA[ Yay! random text >> << ]]>\n" ~
range.popFront();
assert(range.front.type == EntityType.cdata);
assert(range.front.text == " Yay! random text >> << ");

// "    <!-- some random comment -->\n" ~
range.popFront();
assert(range.front.type == EntityType.comment);
assert(range.front.text == " some random comment ");

// "    <p>something here</p>\n" ~
range.popFront();
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "p");

range.popFront();
assert(range.front.type == EntityType.text);
assert(range.front.text == "something here");

range.popFront();
assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "p");

// "    <p>\n" ~
// "       something else\n" ~
// "       here</p>\n" ~
range.popFront();
assert(range.front.type == EntityType.elementStart);

range.popFront();
assert(range.front.type == EntityType.text);
assert(range.front.text == "\n       something else\n       here");

range.popFront();
assert(range.front.type == EntityType.elementEnd);

// "</root>"
range.popFront();
assert(range.front.type == EntityType.elementEnd);

range.popFront();
assert(range.empty);
@property Entity front();
Returns the Entity representing the entity in the XML document which was most recently parsed.
void popFront();
Move to the next entity.
The next entity is the one that comes next linearly in the XML document. So, if the current entity has child entities, the next entity will be the first child entity, whereas if it has no child entities, it will be the next entity at the same level.
Throws: XMLParsingException on invalid XML.
const pure nothrow @nogc @property @safe bool empty();
Whether the end of the XML document has been reached.
Note that because an XMLParsingException will be thrown on invalid XML, it's actually possible to call front and popFront without checking empty if the only way that empty would be true is if the XML were invalid (e.g. if at a start tag, it's a given that there's at least one end tag left in the document unless it's invalid XML).
However, of course, caution should be used to ensure that incorrect assumptions are not made that allow the document to reach its end earlier than predicted without throwing an XMLParsingException, since it's still an error to call front or popFront if empty would return true.
@property auto save();
Forward range function for obtaining a copy of the range which can then be iterated independently of the original.
EntityRange takeNone();
Returns an empty range. This corresponds to std.range.takeNone except that it doesn't create a wrapper type.
template isAttrRange(R)
Whether the given type is a forward range of attributes.
Essentially, an attribute range must be a forward range where
  • each element has the members name, value, and pos
  • name and value are forward ranges of characters
  • name and value have the same type
  • pos is a TextPos
Normally, an attribute range would come from EntityRange.Entity.attributes or DOMEntity.attributes, but as long as a range has the correct API, it qualifies as an attribute range.
Examples:
import std.typecons : Tuple;
import dxml.dom : parseDOM;

alias R1 = typeof(parseXML("<root/>").front.attributes);
static assert(isAttrRange!R1);

alias R2 = typeof(parseDOM("<root/>").children[0].attributes);
static assert(isAttrRange!R2);

alias T = Tuple!(string, "name", string, "value", TextPos, "pos");
static assert(isAttrRange!(T[]));

static assert(!isAttrRange!string);
void getAttrs(R, Args...)(R attrRange, Args args)
if(isAttrRange!R && (Args.length % 2 == 0));
void getAttrs(R, OR, Args...)(R attrRange, ref OR unmatched, Args args)
if(isAttrRange!R && isOutputRange!(OR, ElementType!R) && (Args.length % 2 == 0));
A helper function for processing start tag attributes.
It functions similarly to std.getopt.getopt. It takes a range of attributes and a list of alternating strings and pointers where each string represents the name of the attribute to parse and the pointer immediately after it is assigned the value that corresponds to the attribute name (if present). If the given pointer does not point to the same type as the range of characters used in the attributes, then std.conv.to is used to convert the value to the type the pointer points to.
If a Nullable!T* is given rather than a T*, then it will be treated the same as if it had been T*. So, to!T will be used to convert the attribute value if the matching attribute name is present. The advantage of passing Nullable!T* instead of T* is that it's possible to distinguish between an attribute that wasn't present and one where it was present but was equivalent to T.init.
Unlike std.getopt.getopt, the given range is consumed rather than taking it by ref and leaving the attributes that weren't matched in the range (since that really doesn't work with an arbitrary range as opposed to a dynamic array). However, if the second argument of getAttrs is not a string but is instead an output range that accepts the element type of the range, then any attributes which aren't matched are put into the output range.
Parameters:
R attrRange A range of attributes (see isAttrRange).
OR unmatched An output range that any unmatched attributes from the range are put into (optional argument).
Args args An alternating list of strings and pointers where the names represent the attribute names to get the value of, and the corresponding values get assigned to what the pointers point to.
Throws: XMLParsingException if std.conv.to fails to convert an attribute value.
R skipContents(R)(R entityRange)
if(isInstanceOf!(EntityRange, R));
Takes an EntityRange which is at a start tag and iterates it until it is at its corresponding end tag. It is an error to call skipContents when the current entity is not EntityType.elementStart.
Supported EntityTypes:
elementStart
Returns: The range with its front now at the end tag corresponding to the start tag that was front when the function was called.
Throws: XMLParsingException on invalid XML.
Examples:
auto xml = "<root>\n" ~
           "    <foo>\n" ~
           "        <bar>\n" ~
           "        Some text\n" ~
           "        </bar>\n" ~
           "    </foo>\n" ~
           "    <!-- no comment -->\n" ~
           "</root>";

auto range = parseXML(xml);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "root");

range.popFront();
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "foo");

range = range.skipContents();
assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "foo");

range.popFront();
assert(range.front.type == EntityType.comment);
assert(range.front.text == " no comment ");

range.popFront();
assert(range.front.type == EntityType.elementEnd);
assert(range.front.name == "root");

range.popFront();
assert(range.empty);
R skipToEntityType(R)(R entityRange, EntityType[] entityTypes...)
if(isInstanceOf!(EntityRange, R));
Skips entities until the given EntityType is reached.
If multiple EntityTypes are given, then any one of them counts as a match.
The current entity is skipped regardless of whether it is the given EntityType.
This is essentially a slightly optimized equivalent to
if(!range.empty())
{
    range.popFront();
    range = range.find!((a, b) => a.type == b.type)(entityTypes);
}
Returns: The given range with its front now at the first entity which matched one of the given EntityTypes or an empty range if none were found.
Throws: XMLParsingException on invalid XML.
Examples:
auto xml = "<root>\n" ~
           "    <!-- blah blah blah -->\n" ~
           "    <foo>nothing to say</foo>\n" ~
           "</root>";

auto range = parseXML(xml);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "root");

range = range.skipToEntityType(EntityType.elementStart,
                               EntityType.elementEmpty);
assert(range.front.type == EntityType.elementStart);
assert(range.front.name == "foo");

assert(range.skipToEntityType(EntityType.comment).empty);

// skipToEntityType will work on an empty range but will always
// return an empty range.
assert(range.takeNone().skipToEntityType(EntityType.comment).empty);
R skipToParentEndTag(R)(R entityRange)
if(isInstanceOf!(EntityRange, R));
Skips entities until the end tag is reached that corresponds to the start tag that is the parent of the current entity.
Returns: The given range with its front now at the end tag which corresponds to the parent start tag of the entity that was front when skipToParentEndTag was called. If the current entity does not have a parent start tag (which means that it's either the root element or a comment or PI outside of the root element), then an empty range is returned.
Throws: XMLParsingException on invalid XML.
Examples:
auto xml = "<root>\n" ~
           "    <foo>\n" ~
           "        <!-- comment -->\n" ~
           "        <bar>exam</bar>\n" ~
           "    </foo>\n" ~
           "    <!-- another comment -->\n" ~
           "</root>";
{
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    range.popFront();
    assert(range.front.type == EntityType.comment);
    assert(range.front.text == " comment ");

    range = range.skipToParentEndTag();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "foo");

    range = range.skipToParentEndTag();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");

    range = range.skipToParentEndTag();
    assert(range.empty);
}
{
    auto range = parseXML(xml);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    range.popFront();
    assert(range.front.type == EntityType.comment);
    assert(range.front.text == " comment ");

    range.popFront();
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "bar");

    range.popFront();
    assert(range.front.type == EntityType.text);
    assert(range.front.text == "exam");

    range = range.skipToParentEndTag();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "bar");

    range = range.skipToParentEndTag();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "foo");

    range.popFront();
    assert(range.front.type == EntityType.comment);
    assert(range.front.text == " another comment ");

    range = range.skipToParentEndTag();
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "root");

    assert(range.skipToParentEndTag().empty);
}
{
    auto range = parseXML("<root><foo>bar</foo></root>");
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
    assert(range.skipToParentEndTag().empty);
}
R skipToPath(R)(R entityRange, string path)
if(isInstanceOf!(EntityRange, R));
Treats the given string like a file path except that each directory corresponds to the name of a start tag. Note that this does not try to implement XPath as that would be quite complicated, and it really doesn't fit with a StAX parser.
A start tag should be thought of as a directory, with its child start tags as the directories it contains.
All paths should be relative. EntityRange can only move forward through the document, so using an absolute path would only make sense at the beginning of the document. As such, absolute paths are treated as invalid paths.
"./" and "../" are supported. Repeated slashes such as in "foo//bar" are not supported and are treated as an invalid path.
If range.front.type == EntityType.elementStart, then range.skipToPath("foo") will search for the first child start tag (be it EntityType.elementStart or EntityType.elementEmpty) with the name "foo". That start tag must be a direct child of the current start tag.
If range.front.type is any other EntityType, then range.skipToPath("foo") will return an empty range, because no other EntityTypes have child start tags.
For any EntityType, range.skipToPath("../foo") will search for the first start tag with the name "foo" at the same level as the current entity. If the current entity is a start tag with the name "foo", it will not be considered a match.
range.skipToPath("./") is a no-op. However, range.skipToPath("../") will result in the empty range (since it doesn't target a specific start tag).
range.skipToPath("foo/bar") is equivalent to range.skipToPath("foo").skipToPath("bar"), and range.skipToPath("../foo/bar") is equivalent to range.skipToPath("../foo").skipToPath("bar").
Returns: The given range with its front now at the requested entity if the path is valid; otherwise, an empty range is returned.
Throws: XMLParsingException on invalid XML.
Examples:
{
    auto xml = "<carrot>\n" ~
               "    <foo>\n" ~
               "        <bar>\n" ~
               "            <baz/>\n" ~
               "            <other/>\n" ~
               "        </bar>\n" ~
               "    </foo>\n" ~
               "</carrot>";

    auto range = parseXML(xml);
    // "<carrot>"
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "carrot");

    range = range.skipToPath("foo/bar");
    // "        <bar>
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "bar");

    range = range.skipToPath("baz");
    // "            <baz/>
    assert(!range.empty);
    assert(range.front.type == EntityType.elementEmpty);

    // other is not a child element of baz
    assert(range.skipToPath("other").empty);

    range = range.skipToPath("../other");
    // "            <other/>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementEmpty);
}
{
    auto xml = "<potato>\n" ~
               "    <foo>\n" ~
               "        <bar>\n "~
               "        </bar>\n" ~
               "        <crazy>\n" ~
               "        </crazy>\n" ~
               "        <fou/>\n" ~
               "    </foo>\n" ~
               "    <buzz/>\n" ~
               "</potato>";

    auto range = parseXML(xml);
    // "<potato>"
    assert(range.front.type == EntityType.elementStart);

    range = range.skipToPath("./");
    // "<potato>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "potato");

    range = range.skipToPath("./foo/bar");
    // "        <bar>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "bar");

    range = range.skipToPath("../crazy");
    // "        <crazy>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "crazy");

    // Whether or not popFront is called here before the call to
    // range.skipToPath("../fou") below, the result is the same, because
    // both <crazy> and </crazy> are at the same level.
    range.popFront();
    // "        </crazy>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementEnd);
    assert(range.front.name == "crazy");

    range = range.skipToPath("../fou");
    // "        <fou/>"
    assert(!range.empty);
    assert(range.front.type == EntityType.elementEmpty);
}
// Searching stops at the first matching start tag.
{
    auto xml = "<beet>\n" ~
               "    <foo a='42'>\n" ~
               "    </foo>\n" ~
               "    <foo b='451'>\n" ~
               "    </foo>\n" ~
               "</beet>";

    auto range = parseXML(xml);
    range = range.skipToPath("foo");
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    {
        auto attrs = range.front.attributes;
        assert(attrs.front.name == "a");
        assert(attrs.front.value == "42");
    }

    range = range.skipToPath("../foo");
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "foo");

    {
        auto attrs = range.front.attributes;
        assert(attrs.front.name == "b");
        assert(attrs.front.value == "451");
    }
}
// skipToPath will work on an empty range but will always return an
// empty range.
{
    auto range = parseXML("<root/>");
    assert(range.takeNone().skipToPath("nowhere").empty);
}
// Empty and absolute paths will also result in an empty range as will
// "../" without any actual tag name on the end.
{
    auto range = parseXML("<root/>");
    assert(range.skipToPath("").empty);
    assert(range.skipToPath("/").empty);
    assert(range.skipToPath("../").empty);
}
// Only non-empty start tags have children; all other EntityTypes result
// in an empty range unless "../" is used.
{
    auto xml = "<!-- comment -->\n" ~
               "<root>\n" ~
               "    <foo/>\n" ~
               "</root>";
    auto range = parseXML(xml);
    assert(range.skipToPath("root").empty);
    assert(range.skipToPath("foo").empty);

    range = range.skipToPath("../root");
    assert(!range.empty);
    assert(range.front.type == EntityType.elementStart);
    assert(range.front.name == "root");
}