dxml.parser
This implements a range-based
StAX parser for XML 1.0 (which
will work with XML 1.1 documents assuming that they don't use any
1.1-specific features). For the sake of simplicity, sanity, and efficiency,
the DTD
section is not supported beyond what is required to parse past it.
Start tags, end tags, comments, cdata sections, and processing instructions
are all supported and reported to the application. Anything in the DTD is
skipped (though it's parsed enough to parse past it correctly, and that
can result in an XMLParsingException if that XML isn't valid
enough to be correctly skipped), and the
XML declaration at the
top is skipped if present (XML 1.1 requires that it be there, but XML 1.0
does not).
Regardless of what the XML declaration says (if present), any range of
char will be treated as being encoded in UTF-8, any range of
wchar will be treated as being encoded in UTF-16, and any range of
dchar will be treated as having been encoded in UTF-32. Strings will
be treated as ranges of their code units, not code points. Note that like
Phobos typically does when processing strings, the code assumes that BOMs
have already been removed, so if the range of characters comes from a file
that uses a BOM, the calling code needs to strip it out before calling
parseXML, or parsing will fail due to invalid characters.
Since the DTD is skipped, entity references other than the five which are
predefined by the XML spec cannot be fully processed (since wherever they
were used in the document would be replaced by what they referred to, which
could be arbitrarily complex XML). As such, by default, if any entity
references which are not predefined are encountered outside of the DTD, an
XMLParsingException will be thrown (see
Config.throwOnEntityRef for how that can be configured). The
predefined entity references and any character references encountered will
be checked to verify that they're valid, but they will not be replaced
(since that does not work with returning slices of the original input).
However, decodeXML or
parseStdEntityRef from
dxml.util can be used to convert the predefined entity references
to what they refer to, and decodeXML or
parseCharRef from
dxml.util can be used to convert character references to what they
refer to.
Primary Symbols
Symbol | Description |
---|---|
parseXML | The function used to initiate the parsing of an XML document. |
EntityRange | The range returned by parseXML. |
EntityRange.Entity | The element type of EntityRange. |
Parser Configuration Helpers
Symbol | Description |
---|---|
Config | Used to configure how EntityRange parses the XML. |
simpleXML | A user-friendly configuration for when the application just wants the element tags and the data in between them. |
makeConfig | A convenience function for constructing a custom Config. |
SkipComments | A std.typecons.Flag used with Config
to tell the parser to skip comments. |
SkipPI | A std.typecons.Flag used with Config
to tell the parser to skip processing instructions. |
SplitEmpty | A std.typecons.Flag used with Config
to configure how the parser deals with empty element tags. |
Helper Types Used When Parsing
Symbol | Description |
---|---|
EntityType | The type of an entity in the XML (e.g. a start tag or a comment). |
TextPos | Gives the line and column number in the XML document. |
XMLParsingException | Thrown by EntityRange when it encounters invalid XML. |
Helper Functions Used When Parsing
Symbol | Description |
---|---|
getAttrs | A function similar to std.getopt.getopt which allows for the easy processing of start tag attributes. |
skipContents | Iterates an EntityRange from a start tag to its matching end tag. |
skipToPath | Used to navigate from one start tag to another as if the start tag names formed a file path. |
skipToEntityType | Skips to the next entity of the given type in the range. |
skipToParentEndTag | Iterates an EntityRange until it reaches the end tag that matches the start tag which is the parent of the current entity. |
Helper Traits
Symbol | Description |
---|---|
isAttrRange | Whether the given range is a range of attributes. |
License:
Boost License 1.0.
See Also:
Official Specification for XML 1.0
Examples:
auto xml = "<!-- comment -->\n" ~ "<root>\n" ~ " <foo>some text<whatever/></foo>\n" ~ " <bar/>\n" ~ " <baz></baz>\n" ~ "</root>"; { auto range = parseXML(xml); assert(range.front.type == EntityType.comment); assert(range.front.text == " comment "); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "some text"); range.popFront(); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "whatever"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "baz"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "baz"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty); } { auto range = parseXML!simpleXML(xml); // simpleXML skips comments assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "some text"); range.popFront(); // simpleXML splits empty element tags into a start tag and end tag // so that the code doesn't have to care whether a start tag with no // content is an empty tag or a start tag and end tag with nothing but // whitespace in between. 
assert(range.front.type == EntityType.elementStart); assert(range.front.name == "whatever"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "whatever"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "baz"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "baz"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty); }
- class
XMLParsingException
: object.Exception; - The exception type thrown when the XML parser encounters invalid XML.
- TextPos
pos
; - The position in the XML input where the problem is.
- struct
TextPos
; - Where in the XML document an entity is. The line and column numbers are 1-based. The primary use case for
TextPos
is XMLParsingException, but an application may have other uses for it. The TextPos
for an Entity can be obtained from Entity.pos.- int
line
; - A
line
number in the XML file. - int
col
; - A column number in a line of the XML file. Each code unit is considered a column, so depending on what a program is looking to do with the column number, it may need to examine the actual text on that line and calculate the number that represents what the program wants to display (e.g. the number of graphemes).
- struct
Config
; - Used to configure how the parser works.
- Flag
skipComments
; - Whether the comments should be skipped while parsing. If
skipComments
== SkipComments.yes, any entities of type EntityType.comment will be omitted from the parsing results, and they will not be validated beyond what is required to parse past them. Defaults to SkipComments.no. - Flag
skipPI
; - Whether processing instructions should be skipped. If
skipPI
== SkipPI.yes, any entities of type EntityType.pi will be skipped, and they will not be validated beyond what is required to parse past them. Defaults to SkipPI.no. - Flag
splitEmpty
; - Whether the parser should report empty element tags as if they were a start tag followed by an end tag with nothing in between. If
splitEmpty
== SplitEmpty.yes, then whenever an EntityType.elementEmpty is encountered, the parser will claim that that entity is an EntityType.elementStart, and then it will provide an EntityType.elementEnd as the next entity before the entity that actually follows it. The purpose of this is to simplify the code using the parser, since most code does not care about the difference between an empty tag and a start and end tag with nothing in between. But since some code may care about the difference, the behavior is configurable. Defaults to SplitEmpty.no. Examples: enum configSplitYes = makeConfig(SplitEmpty.yes); { auto range = parseXML("<root></root>"); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty); } { // No difference if the tags are already split. auto range = parseXML!configSplitYes("<root></root>"); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty); } { // This treats <root></root> and <root/> as distinct. auto range = parseXML("<root/>"); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "root"); range.popFront(); assert(range.empty); } { // This is parsed as if it were <root></root> instead of <root/>. auto range = parseXML!configSplitYes("<root/>"); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty); }
- Flag
throwOnEntityRef
; - Whether the parser should throw when it encounters any entity references other than the five entity references defined in the XML standard.Any other entity references would have to be defined in the DTD in order to be valid. And in order to know what XML they represent (which could be arbitrarily complex, even effectively inserting entire XML documents into the middle of the XML), the DTD would have to be parsed. However, dxml does not support parsing the DTD beyond what is required to correctly parse past it, and replacing entity references with what they represent would not work with the slicing semantics that EntityRange provides. As such, it is not possible for dxml to correctly handle any entity references other than the five which are defined in the XML standard, and even those are only parsed by using dxml.util.decodeXML or dxml.util.parseStdEntityRef. EntityRange always validates that entity references are one of the five, predefined entity references, but otherwise, it lets them pass through as normal text. It does not replace them with what they represent. As such, the default behavior of EntityRange is to throw an XMLParsingException when it encounters an entity reference which is not one of the five defined by the XML standard. With that behavior, there is no risk of processing an XML document as if it had no entity references and ending up with what the program using the parser would probably consider incorrect results. However, there are cases where a program may find it acceptable to treat entity references as normal text and ignore them. As such, if a program wishes to take that approach, it can set
throwOnEntityRef
to ThrowOnEntityRef.no. If throwOnEntityRef
== ThrowOnEntityRef.no, then any entity reference that it encounters will be validated to ensure that it is syntactically valid (i.e. that the characters it contains form what could be a valid entity reference assuming that the DTD declared it properly), but otherwise, EntityRange will treat it as normal text, just like it treats the five, predefined entity references as normal text. Note that any valid XML entity reference which contains start or end tags must contain matching start or end tags, and entity references cannot contain incomplete fragments of XML (e.g. the start or end of a comment). So, missing entity references should only affect the data in the XML document and not its overall structure (if that were not true, attempting to ignore entity references such as ThrowOnEntityRef.no does would be a disaster in the making). However, how reasonable it is to miss that data depends entirely on the application and what the XML documents it's parsing contain - hence, the behavior is configurable.See Also: dxml.util.StdEntityRef
dxml.util.parseStdEntityRef
dxml.util.parseCharRef
dxml.util.encodeCharRef
dxml.util.decodeXML
dxml.util.asDecodedXML Examples: import std.exception : assertThrown; import dxml.util : decodeXML; auto xml = "<root>\n" ~ " <std>&amp;&apos;&gt;&lt;&quot;</std>\n" ~ " <other>&foobar;</other>\n" ~ " <invalid>&--;</invalid>\n" ~ "</root>"; // ThrowOnEntityRef.yes { auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "std"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "&amp;&apos;&gt;&lt;&quot;"); assert(range.front.text.decodeXML() == `&'><"`); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "std"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "other"); // Attempted to parse past "&foobar;", which is syntactically // valid, but it's not one of the five predefined entity references. assertThrown!XMLParsingException(range.popFront()); } // ThrowOnEntityRef.no { auto range = parseXML!(makeConfig(ThrowOnEntityRef.no))(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "std"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "&amp;&apos;&gt;&lt;&quot;"); assert(range.front.text.decodeXML() == `&'><"`); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "std"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "other"); // Doesn't throw, because "&foobar;" is syntactically valid. range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "&foobar;"); // decodeXML has no effect on non-standard entity references. 
assert(range.front.text.decodeXML() == "&foobar;"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "other"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "invalid"); // Attempted to parse past "&--;", which is not syntactically valid, // because -- is not a valid name for an entity reference. assertThrown!XMLParsingException(range.popFront()); }
- alias
SkipComments
= std.typecons.Flag!"SkipComments
".Flag; - See Also: skipComments
- alias
SkipPI
= std.typecons.Flag!"SkipPI
".Flag; - See Also: skipPI
- alias
SplitEmpty
= std.typecons.Flag!"SplitEmpty
".Flag; - See Also: splitEmpty
- alias
ThrowOnEntityRef
= std.typecons.Flag!"ThrowOnEntityRef
".Flag; - See Also: throwOnEntityRef
- Config
makeConfig
(Args...)(Args args); - Helper function for creating a custom config. It makes it easy to set one or more of the member variables to something other than the default without having to worry about explicitly setting them individually or setting them all at once via a constructor. The order of the arguments does not matter. The types of each of the members of Config are unique, so that information alone is sufficient to determine which argument should be assigned to which member. Examples:
{ auto config = makeConfig(SkipComments.yes); assert(config.skipComments == SkipComments.yes); assert(config.skipPI == Config.init.skipPI); assert(config.splitEmpty == Config.init.splitEmpty); assert(config.throwOnEntityRef == Config.init.throwOnEntityRef); } { auto config = makeConfig(SkipComments.yes, SkipPI.yes); assert(config.skipComments == SkipComments.yes); assert(config.skipPI == SkipPI.yes); assert(config.splitEmpty == Config.init.splitEmpty); assert(config.throwOnEntityRef == Config.init.throwOnEntityRef); } { auto config = makeConfig(SplitEmpty.yes, SkipComments.yes, ThrowOnEntityRef.no); assert(config.skipComments == SkipComments.yes); assert(config.skipPI == Config.init.skipPI); assert(config.splitEmpty == SplitEmpty.yes); assert(config.throwOnEntityRef == ThrowOnEntityRef.no); }
- enum Config
simpleXML
; - This Config is intended for making it easy to parse XML by skipping everything that isn't the actual data as well as making it simpler to deal with empty element tags by treating them the same as a start tag and end tag with nothing but whitespace between them. Examples:
static assert(simpleXML.skipComments == SkipComments.yes); static assert(simpleXML.skipPI == SkipPI.yes); static assert(simpleXML.splitEmpty == SplitEmpty.yes); static assert(simpleXML.throwOnEntityRef == ThrowOnEntityRef.yes);
- enum
EntityType
: int; - Represents the type of an XML entity. Used by EntityRange.Entity.
cdata
- A
cdata
section: <![CDATA[ ... ]]>. comment
- An XML
comment
: <!-- ... -->. elementStart
- The start tag for an element. e.g. <foo name="value">.
elementEnd
- The end tag for an element. e.g. </foo>.
elementEmpty
- The tag for an element with no contents or matching end tag. e.g. <foo name="value"/>.
pi
- A processing instruction such as <?foo?>. Note that the <?xml ... ?> is skipped and not treated as an EntityType.pi. See Also: http://www.w3.org/TR/REC-xml/#sec-pi
text
- The content of an element tag that is simple
text
. If there is an entity other than the end tag following the text
, then the text
includes up to that entity. Note however that character references (e.g. "&#42;") and the predefined entity references (e.g. "&apos;") are left unprocessed in the text
. In order for them to be processed, the text
should be passed to either decodeXML or asDecodedXML. Entity references which are not predefined are considered invalid XML, because the DTD section is skipped, and thus they cannot be processed properly.
- struct
EntityRange
(Config cfg, R) if (isForwardRange!R && isSomeChar!(ElementType!R)); EntityRange!(config, R)parseXML
(Config config = Config.init, R)(R xmlText)
if(isForwardRange!R && isSomeChar!(ElementType!R)); - Lazily parses the given range of characters as an XML document.
EntityRange
is essentially a StAX parser, though it evolved into that rather than being based on what Java did, and it's range-based rather than iterator-based, so its API is likely to differ from other implementations. The basic concept should be the same though. One of the core design goals of this parser is to slice the original input rather than having to allocate strings for the output or wrap it in a lazy range that produces a mutated version of the data. So, all of the text that the parser provides is either a slice or std.range.takeExactly of the input. However, in some cases, for the parser to be fully compliant with the XML spec, dxml.util.decodeXML must be called on the text to mutate certain constructs (e.g. removing any '\r' in the text or converting "&lt;" to '<'). But that's left up to the application. The parser is not @nogc, but it allocates memory very minimally. It allocates some of its state on the heap so it can validate attributes and end tags. However, that state is shared among all the ranges that came from the same call to parseXML
(only the range farthest along in parsing validates attributes or end tags), so save does not allocate memory unless save on the underlying range allocates memory. The shared state currently uses a couple of dynamic arrays to validate the tags and attributes, and if the document has a particularly deep tag depth or has a lot of attributes on a start tag, then some reallocations may occur until the maximum is reached, but enough is reserved that for most documents, no reallocations will occur. The only other times that the parser would allocate would be if an exception were thrown or if the range that was passed to parseXML
allocates for any reason when calling any of the range primitives. If invalid XML is encountered at any point during the parsing process, an XMLParsingException will be thrown. If an exception has been thrown, then the parser is in an invalid state, and it is an error to call any functions on it. However, note that XML validation is reduced for any entities that are skipped (e.g. for anything in the DTD, validation is reduced to what is required to correctly parse past it, and when Config.skipPI == SkipPI.yes, processing instructions are only validated enough to correctly skip past them). As the module documentation says, this parser does not provide any DTD support. It is not possible to properly support the DTD while returning slices of the original input, and the DTD portion of the spec makes parsing XML far, far more complicated. A quick note about carriage returns: per the XML spec, they are all supposed to either be stripped out or replaced with newlines or spaces before the XML parser even processes the text. That doesn't work when the parser is slicing the original text and not mutating it at all. So, for the purposes of parsing, this parser treats all carriage returns as if they were newlines or spaces (though they won't count as newlines when counting the lines for TextPos). However, they will appear in any text fields or attribute values if they are in the document (since the text fields and attribute values are slices of the original text). dxml.util.decodeXML can be used to strip them along with converting any character references in the text. Alternatively, the application can remove them all before calling parseXML
, but it's not necessary.Examples:import std.range.primitives : walkLength; auto xml = "<?xml version='1.0'?>\n" ~ "<?instruction start?>\n" ~ "<foo attr='42'>\n" ~ " <bar/>\n" ~ " <!-- no comment -->\n" ~ " <baz hello='world'>\n" ~ " nothing to say.\n" ~ " nothing at all...\n" ~ " </baz>\n" ~ "</foo>\n" ~ "<?some foo?>"; { auto range = parseXML(xml); assert(range.front.type == EntityType.pi); assert(range.front.name == "instruction"); assert(range.front.text == "start"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); { auto attrs = range.front.attributes; assert(walkLength(attrs.save) == 1); assert(attrs.front.name == "attr"); assert(attrs.front.value == "42"); } range.popFront(); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " no comment "); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "baz"); { auto attrs = range.front.attributes; assert(walkLength(attrs.save) == 1); assert(attrs.front.name == "hello"); assert(attrs.front.value == "world"); } range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "\n nothing to say.\n nothing at all...\n "); range.popFront(); assert(range.front.type == EntityType.elementEnd); // </baz> range.popFront(); assert(range.front.type == EntityType.elementEnd); // </foo> range.popFront(); assert(range.front.type == EntityType.pi); assert(range.front.name == "some"); assert(range.front.text == "foo"); range.popFront(); assert(range.empty); } { auto range = parseXML!simpleXML(xml); // simpleXML is set to skip processing instructions. 
assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); { auto attrs = range.front.attributes; assert(walkLength(attrs.save) == 1); assert(attrs.front.name == "attr"); assert(attrs.front.value == "42"); } // simpleXML is set to split empty tags so that <bar/> is treated // as the same as <bar></bar> so that code does not have to // explicitly handle empty tags. range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "bar"); // simpleXML is set to skip comments. range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "baz"); { auto attrs = range.front.attributes; assert(walkLength(attrs.save) == 1); assert(attrs.front.name == "hello"); assert(attrs.front.value == "world"); } range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "\n nothing to say.\n nothing at all...\n "); range.popFront(); assert(range.front.type == EntityType.elementEnd); // </baz> range.popFront(); assert(range.front.type == EntityType.elementEnd); // </foo> range.popFront(); assert(range.empty); }
- alias
config
= cfg; - The Config used when parsing the XML.
- alias
Input
= R; - The type of the range that EntityRange is parsing.
- alias
SliceOfR
= R; - The type used when any slice of the original input is used. If R is a string or supports slicing, then
SliceOfR
is the same as R; otherwise, it's the result of calling std.range.takeExactly on the input.import std.algorithm : filter; import std.range : takeExactly; static assert(is(EntityRange!(Config.init, string).SliceOfR == string)); auto range = filter!(a => true)("some xml"); static assert(is(EntityRange!(Config.init, typeof(range)).SliceOfR == typeof(takeExactly(range, 42))));
- struct
Entity
; - Represents an entity in the XML document. Note that the type determines which properties can be used, and it can determine whether functions which an
Entity
or EntityRange is passed to are allowed to be called. Each function lists which EntityTypes are allowed, and it is an error to call them with any other EntityType.- alias
Attribute
= Tuple!(SliceOfR, "name", SliceOfR, "value", TextPos, "pos"); - The exact instantiation of std.typecons.Tuple that attributes returns a range of. See Also: attributes
- const pure nothrow @nogc @property @safe EntityType
type
(); - The EntityType for this Entity. Examples:
auto xml = "<root>\n" ~ " <!--no comment-->\n" ~ " <![CDATA[cdata run]]>\n" ~ " <text>I am text!</text>\n" ~ " <empty/>\n" ~ " <?pi?>\n" ~ "</root>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == "no comment"); range.popFront(); assert(range.front.type == EntityType.cdata); assert(range.front.text == "cdata run"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "text"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "I am text!"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "text"); range.popFront(); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "empty"); range.popFront(); assert(range.front.type == EntityType.pi); assert(range.front.name == "pi"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty);
- const pure nothrow @nogc @property @safe TextPos
pos
(); - The position in the original text where the entity starts. Examples:
auto xml = "<root>\n" ~ " <foo>\n" ~ " Foo and bar. Always foo and bar...\n" ~ " </foo>\n" ~ "</root>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); assert(range.front.pos == TextPos(1, 1)); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); assert(range.front.pos == TextPos(2, 5)); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "\n" ~ " Foo and bar. Always foo and bar...\n" ~ " "); assert(range.front.pos == TextPos(2, 10)); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); assert(range.front.pos == TextPos(4, 5)); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); assert(range.front.pos == TextPos(5, 1)); range.popFront(); assert(range.empty);
- @property SliceOfR
name
(); - Gives the
name
of this Entity. Note that this is the direct name
in the XML for this entity and does not contain any of the names of any of the parent entities that this entity has. If an application wants the full "path" of the entity, then it will have to keep track of that itself. The parser does not do that as it would require allocating memory.Supported EntityTypes: elementStart elementEnd elementEmpty pi Examples:auto xml = "<root>\n" ~ " <empty/>\n" ~ " <?pi?>\n" ~ "</root>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementEmpty); assert(range.front.name == "empty"); range.popFront(); assert(range.front.type == EntityType.pi); assert(range.front.name == "pi"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty);
- @property auto
attributes
(); - Returns a lazy range of
attributes
for a start tag where each attribute is represented as a
Tuple!( SliceOfR, "name", SliceOfR, "value", TextPos, "pos").Supported EntityTypes: elementStart elementEmpty Examples:import std.algorithm.comparison : equal; import std.algorithm.iteration : filter; { auto xml = "<root/>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementEmpty); assert(range.front.attributes.empty); static assert(is(ElementType!(typeof(range.front.attributes)) == typeof(range).Entity.Attribute)); } { auto xml = "<root a='42' q='29' w='hello'/>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementEmpty); auto attrs = range.front.attributes; assert(attrs.front.name == "a"); assert(attrs.front.value == "42"); assert(attrs.front.pos == TextPos(1, 7)); attrs.popFront(); assert(attrs.front.name == "q"); assert(attrs.front.value == "29"); assert(attrs.front.pos == TextPos(1, 14)); attrs.popFront(); assert(attrs.front.name == "w"); assert(attrs.front.value == "hello"); assert(attrs.front.pos == TextPos(1, 21)); attrs.popFront(); assert(attrs.empty); } // Because the type of name and value is SliceOfR, == with a string // only works if the range passed to parseXML was string. { auto xml = filter!(a => true)("<root a='42' q='29' w='hello'/>"); auto range = parseXML(xml); assert(range.front.type == EntityType.elementEmpty); auto attrs = range.front.attributes; assert(equal(attrs.front.name, "a")); assert(equal(attrs.front.value, "42")); assert(attrs.front.pos == TextPos(1, 7)); attrs.popFront(); assert(equal(attrs.front.name, "q")); assert(equal(attrs.front.value, "29")); assert(attrs.front.pos == TextPos(1, 14)); attrs.popFront(); assert(equal(attrs.front.name, "w")); assert(equal(attrs.front.value, "hello")); assert(attrs.front.pos == TextPos(1, 21)); attrs.popFront(); assert(attrs.empty); }
- @property SliceOfR
text
(); - Returns the textual value of this Entity.In the case of EntityType.pi, this is the
text
that follows the name, whereas in the other cases, the text
is the entire contents of the entity (save for the delimiters on the ends if that entity has them). Supported EntityTypes: cdata comment pi text Examples:import std.range.primitives : empty; auto xml = "<?xml version='1.0'?>\n" ~ "<?instructionName?>\n" ~ "<?foo here is something to say?>\n" ~ "<root>\n" ~ " <![CDATA[ Yay! random text >> << ]]>\n" ~ " <!-- some random comment -->\n" ~ " <p>something here</p>\n" ~ " <p>\n" ~ " something else\n" ~ " here</p>\n" ~ "</root>"; auto range = parseXML(xml); // "<?instructionName?>\n" ~ assert(range.front.type == EntityType.pi); assert(range.front.name == "instructionName"); assert(range.front.text.empty); // "<?foo here is something to say?>\n" ~ range.popFront(); assert(range.front.type == EntityType.pi); assert(range.front.name == "foo"); assert(range.front.text == "here is something to say"); // "<root>\n" ~ range.popFront(); assert(range.front.type == EntityType.elementStart); // " <![CDATA[ Yay! random text >> << ]]>\n" ~ range.popFront(); assert(range.front.type == EntityType.cdata); assert(range.front.text == " Yay! 
random text >> << "); // " <!-- some random comment -->\n" ~ range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " some random comment "); // " <p>something here</p>\n" ~ range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "p"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "something here"); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "p"); // " <p>\n" ~ // " something else\n" ~ // " here</p>\n" ~ range.popFront(); assert(range.front.type == EntityType.elementStart); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "\n something else\n here"); range.popFront(); assert(range.front.type == EntityType.elementEnd); // "</root>" range.popFront(); assert(range.front.type == EntityType.elementEnd); range.popFront(); assert(range.empty);
- @property Entity
front
(); - Returns the Entity representing the entity in the XML document which was most recently parsed.
- void
popFront
(); - Move to the next entity.The next entity is the next one that is linearly in the XML document. So, if the current entity has child entities, the next entity will be the first child entity, whereas if it has no child entities, it will be the next entity at the same level.Throws: XMLParsingException on invalid XML.
- const pure nothrow @nogc @property @safe bool
empty
(); - Whether the end of the XML document has been reached. Note that because an XMLParsingException will be thrown on invalid XML, it's actually possible to call front and popFront without checking
empty
if the only way that empty
would be true
is if the XML were invalid (e.g. if at a start tag, it's a given that there's at least one end tag left in the document unless it's invalid XML). However, of course, caution should be used to ensure that incorrect assumptions are not made that allow the document to reach its end earlier than predicted without throwing an XMLParsingException, since it's still an error to call front or popFront if empty
would return true
. - @property auto
save
(); - Forward range function for obtaining a copy of the range which can then be iterated independently of the original.
- EntityRange
takeNone
(); - Returns an empty range. This corresponds to std.range.takeNone except that it doesn't create a wrapper type.
- template
isAttrRange
(R) - Whether the given type is a forward range of attributes.Essentially, an attribute range must be a forward range where
- each element has the members name, value, and pos
- name and value are forward ranges of characters
- name and value have the same type
- pos is a TextPos
See Also: EntityRange.Entity.Attribute
EntityRange.Entity.attributes
DOMEntity.Attribute
DOMEntity.attributesExamples:import std.typecons : Tuple; import dxml.dom : parseDOM; alias R1 = typeof(parseXML("<root/>").front.attributes); static assert(isAttrRange!R1); alias R2 = typeof(parseDOM("<root/>").children[0].attributes); static assert(isAttrRange!R2); alias T = Tuple!(string, "name", string, "value", TextPos, "pos"); static assert(isAttrRange!(T[])); static assert(!isAttrRange!string);
- void
getAttrs
(R, Args...)(R attrRange, Args args)
if(isAttrRange!R && (Args.length % 2 == 0)); voidgetAttrs
(R, OR, Args...)(R attrRange, ref OR unmatched, Args args)
if(isAttrRange!R && isOutputRange!(OR, ElementType!R) && (Args.length % 2 == 0)); - A helper function for processing start tag attributes.It functions similarly to std.getopt.getopt. It takes a range of attributes and a list of alternating strings and pointers where each string represents the name of the attribute to parse and the pointer immediately after it is assigned the value that corresponds to the attribute name (if present). If the given pointer does not point to the same type as the range of characters used in the attributes, then std.conv.to is used to convert the value to the type the pointer points to. If a Nullable!T* is given rather than a T*, then it will be treated the same as if it had been T*. So, to!T will be used to convert the attribute value if the matching attribute name is present. The advantage of passing Nullable!T* instead of T* is that it's possible to distinguish between an attribute that wasn't present and one where it was present but was equivalent to T.init. Unlike std.getopt.getopt, the given range is consumed rather than taking it by ref and leaving the attributes that weren't matched in the range (since that really doesn't work with an arbitrary range as opposed to a dynamic array). However, if the second argument of
getAttrs
is not a string but is instead an output range that accepts the element type of the range, then any attributes which aren't matched are put into the output range.Parameters:R attrRange A range of attributes (see isAttrRange). OR unmatched An output range that any unmatched attributes from the range are put into (optional argument). Args args An alternating list of strings and pointers where the names represent the attribute names to get the value of, and the corresponding values get assigned to what the pointers point to. - R
skipContents
(R)(R entityRange)
if(isInstanceOf!(EntityRange, R)); - Takes an EntityRange which is at a start tag and iterates it until it is at its corresponding end tag. It is an error to call
skipContents
when the current entity is not EntityType.elementStart.Supported EntityTypes: elementStart Returns: The range with its front now at the end tag corresponding to the start tag that was front when the function was called.Throws: XMLParsingException on invalid XML.Examples:auto xml = "<root>\n" ~ " <foo>\n" ~ " <bar>\n" ~ " Some text\n" ~ " </bar>\n" ~ " </foo>\n" ~ " <!-- no comment -->\n" ~ "</root>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); range = range.skipContents(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " no comment "); range.popFront(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range.popFront(); assert(range.empty);
- R
skipToEntityType
(R)(R entityRange, EntityType[] entityTypes...)
if(isInstanceOf!(EntityRange, R)); - Skips entities until the given EntityType is reached.If multiple EntityTypes are given, then any one of them counts as a match. The current entity is skipped regardless of whether it is the given EntityType. This is essentially a slightly optimized equivalent to
if(!range.empty()) { range.popFront(); range = range.find!((a, b) => a.type == b.type)(entityTypes); }
Returns: The given range with its front now at the first entity which matched one of the given EntityTypes or an empty range if none were found.Throws: XMLParsingException on invalid XML.Examples:auto xml = "<root>\n" ~ " <!-- blah blah blah -->\n" ~ " <foo>nothing to say</foo>\n" ~ "</root>"; auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range = range.skipToEntityType(EntityType.elementStart, EntityType.elementEmpty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); assert(range.skipToEntityType(EntityType.comment).empty); // skipToEntityType will work on an empty range but will always // return an empty range. assert(range.takeNone().skipToEntityType(EntityType.comment).empty);
- R
skipToParentEndTag
(R)(R entityRange)
if(isInstanceOf!(EntityRange, R)); - Skips entities until the end tag is reached that corresponds to the start tag that is the parent of the current entity.Returns: The given range with its front now at the end tag which corresponds to the parent start tag of the entity that was front when
skipToParentEndTag
was called. If the current entity does not have a parent start tag (which means that it's either the root element or a comment or PI outside of the root element), then an empty range is returned.Throws: XMLParsingException on invalid XML.Examples:auto xml = "<root>\n" ~ " <foo>\n" ~ " <!-- comment -->\n" ~ " <bar>exam</bar>\n" ~ " </foo>\n" ~ " <!-- another comment -->\n" ~ "</root>"; { auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " comment "); range = range.skipToParentEndTag(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); range = range.skipToParentEndTag(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "root"); range = range.skipToParentEndTag(); assert(range.empty); } { auto range = parseXML(xml); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " comment "); range.popFront(); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "bar"); range.popFront(); assert(range.front.type == EntityType.text); assert(range.front.text == "exam"); range = range.skipToParentEndTag(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "bar"); range = range.skipToParentEndTag(); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "foo"); range.popFront(); assert(range.front.type == EntityType.comment); assert(range.front.text == " another comment "); range = range.skipToParentEndTag(); assert(range.front.type == 
EntityType.elementEnd); assert(range.front.name == "root"); assert(range.skipToParentEndTag().empty); } { auto range = parseXML("<root><foo>bar</foo></root>"); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); assert(range.skipToParentEndTag().empty); }
- R
skipToPath
(R)(R entityRange, string path)
if(isInstanceOf!(EntityRange, R)); - Treats the given string like a file path except that each directory corresponds to the name of a start tag. Note that this does not try to implement XPath as that would be quite complicated, and it really doesn't fit with a StAX parser. A start tag should be thought of as a directory, with its child start tags as the directories it contains. All paths should be relative. EntityRange can only move forward through the document, so using an absolute path would only make sense at the beginning of the document. As such, absolute paths are treated as invalid paths. "./" and "../" are supported. Repeated slashes such as in "foo//bar" are not supported and are treated as an invalid path. If range.front.type == EntityType.elementStart, then range.skipToPath("foo") will search for the first child start tag (be it EntityType.elementStart or EntityType.elementEmpty) with the name "foo". That start tag must be a direct child of the current start tag. If range.front.type is any other EntityType, then range.skipToPath("foo") will return an empty range, because no other EntityTypes have child start tags. For any EntityType, range.skipToPath("../foo") will search for the first start tag with the name "foo" at the same level as the current entity. If the current entity is a start tag with the name "foo", it will not be considered a match. range.skipToPath("./") is a no-op. However, range.skipToPath("../") will result in the empty range (since it doesn't target a specific start tag). range.skipToPath("foo/bar") is equivalent to range.skipToPath("foo").skipToPath("bar"), and range.skipToPath("../foo/bar") is equivalent to range.skipToPath("../foo").skipToPath("bar"). Returns: The given range with its front now at the requested entity if the path is valid; otherwise, an empty range is returned. Throws: XMLParsingException on invalid XML. Examples:
{ auto xml = "<carrot>\n" ~ " <foo>\n" ~ " <bar>\n" ~ " <baz/>\n" ~ " <other/>\n" ~ " </bar>\n" ~ " </foo>\n" ~ "</carrot>"; auto range = parseXML(xml); // "<carrot>" assert(range.front.type == EntityType.elementStart); assert(range.front.name == "carrot"); range = range.skipToPath("foo/bar"); // " <bar> assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "bar"); range = range.skipToPath("baz"); // " <baz/> assert(!range.empty); assert(range.front.type == EntityType.elementEmpty); // other is not a child element of baz assert(range.skipToPath("other").empty); range = range.skipToPath("../other"); // " <other/>" assert(!range.empty); assert(range.front.type == EntityType.elementEmpty); } { auto xml = "<potato>\n" ~ " <foo>\n" ~ " <bar>\n "~ " </bar>\n" ~ " <crazy>\n" ~ " </crazy>\n" ~ " <fou/>\n" ~ " </foo>\n" ~ " <buzz/>\n" ~ "</potato>"; auto range = parseXML(xml); // "<potato>" assert(range.front.type == EntityType.elementStart); range = range.skipToPath("./"); // "<potato>" assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "potato"); range = range.skipToPath("./foo/bar"); // " <bar>" assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "bar"); range = range.skipToPath("../crazy"); // " <crazy>" assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "crazy"); // Whether popFront is called here before the call to // range.skipToPath("../fou") below, the result is the same, because // both <crazy> and </crazy> are at the same level. range.popFront(); // " </crazy>" assert(!range.empty); assert(range.front.type == EntityType.elementEnd); assert(range.front.name == "crazy"); range = range.skipToPath("../fou"); // " <fou/>" assert(!range.empty); assert(range.front.type == EntityType.elementEmpty); } // Searching stops at the first matching start tag. 
{ auto xml = "<beet>\n" ~ " <foo a='42'>\n" ~ " </foo>\n" ~ " <foo b='451'>\n" ~ " </foo>\n" ~ "</beet>"; auto range = parseXML(xml); range = range.skipToPath("foo"); assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); { auto attrs = range.front.attributes; assert(attrs.front.name == "a"); assert(attrs.front.value == "42"); } range = range.skipToPath("../foo"); assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "foo"); { auto attrs = range.front.attributes; assert(attrs.front.name == "b"); assert(attrs.front.value == "451"); } } // skipToPath will work on an empty range but will always return an // empty range. { auto range = parseXML("<root/>"); assert(range.takeNone().skipToPath("nowhere").empty); } // Empty and absolute paths will also result in an empty range as will // "../" without any actual tag name on the end. { auto range = parseXML("<root/>"); assert(range.skipToPath("").empty); assert(range.skipToPath("/").empty); assert(range.skipToPath("../").empty); } // Only non-empty start tags have children; all other EntityTypes result // in an empty range unless "../" is used. { auto xml = "<!-- comment -->\n" ~ "<root>\n" ~ " <foo/>\n" ~ "</root>"; auto range = parseXML(xml); assert(range.skipToPath("root").empty); assert(range.skipToPath("foo").empty); range = range.skipToPath("../root"); assert(!range.empty); assert(range.front.type == EntityType.elementStart); assert(range.front.name == "root"); }