roxmltree/
parse.rs

1use alloc::string::{String, ToString};
2use alloc::vec::Vec;
3use core::ops::Range;
4
5use crate::{
6    AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
7    NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
8    XMLNS,
9};
10
11use crate::tokenizer::{self, Reference, StrSpan, Stream};
12
/// Module-local shorthand for results whose error type is [`Error`].
type Result<T> = core::result::Result<T, Error>;
14
/// A list of all possible errors.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    /// The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.
    InvalidXmlPrefixUri(TextPos),

    /// Only the `xmlns:xml` attribute can have the <http://www.w3.org/XML/1998/namespace> URI.
    UnexpectedXmlUri(TextPos),

    /// The <http://www.w3.org/2000/xmlns/> URI must not be declared.
    UnexpectedXmlnsUri(TextPos),

    /// `xmlns` can't be used as an element prefix.
    InvalidElementNamePrefix(TextPos),

    /// A namespace was already defined on this element.
    ///
    /// The first value is the namespace prefix that was declared twice.
    DuplicatedNamespace(String, TextPos),

    /// An unknown namespace.
    ///
    /// Indicates that an element or an attribute has an unknown qualified name prefix.
    ///
    /// The first value is a prefix.
    UnknownNamespace(String, TextPos),

    /// Incorrect tree structure.
    ///
    /// The values are, in order: the expected tag name, the actual tag name
    /// and the position of the close tag.
    #[allow(missing_docs)]
    UnexpectedCloseTag(String, String, TextPos),

    /// Entity value starts with a close tag.
    ///
    /// Example:
    /// ```xml
    /// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
    /// <root>&p;</root>
    /// ```
    UnexpectedEntityCloseTag(TextPos),

    /// A reference to an entity that was not defined in the DTD.
    ///
    /// The first value is the entity name.
    UnknownEntityReference(String, TextPos),

    /// A malformed entity reference.
    ///
    /// A `&` character inside an attribute value or text indicates an entity reference.
    /// Otherwise, the document is not well-formed.
    MalformedEntityReference(TextPos),

    /// A possible entity reference loop.
    ///
    /// The current depth limit is 10. The max number of references per reference is 255.
    EntityReferenceLoop(TextPos),

    /// Attribute value cannot have a `<` character.
    InvalidAttributeValue(TextPos),

    /// An element has duplicated attributes.
    ///
    /// Attribute names are compared after their namespaces are resolved,
    /// so an element like this will lead to an error.
    /// ```xml
    /// <e xmlns:n1='http://www.w3.org' xmlns:n2='http://www.w3.org' n1:a='b1' n2:a='b2'/>
    /// ```
    ///
    /// The first value is the local name of the attribute.
    DuplicatedAttribute(String, TextPos),

    /// The XML document must have at least one element.
    NoRootNode,

    /// The root node was opened but never closed.
    UnclosedRootNode,

    /// An XML document can have only one XML declaration
    /// and it must be at the start of the document.
    UnexpectedDeclaration(TextPos),

    /// An XML with DTD detected.
    ///
    /// This error will be emitted only when `ParsingOptions::allow_dtd` is set to `false`.
    DtdDetected,

    /// Indicates that the [`ParsingOptions::nodes_limit`] was reached.
    NodesLimitReached,

    /// Indicates that too many attributes were parsed.
    AttributesLimitReached,

    /// Indicates that too many namespaces were parsed.
    NamespacesLimitReached,

    /// An invalid name.
    InvalidName(TextPos),

    /// A non-XML character has occurred.
    ///
    /// Valid characters are: <https://www.w3.org/TR/xml/#char32>
    NonXmlChar(char, TextPos),

    /// An invalid/unexpected character.
    ///
    /// The values are, in order: the expected byte, the actual byte and the position.
    InvalidChar(u8, u8, TextPos),

    /// An invalid/unexpected character.
    ///
    /// The values are, in order: a description of what was expected,
    /// the actual byte and the position.
    InvalidChar2(&'static str, u8, TextPos),

    /// An unexpected string.
    ///
    /// Contains what string was expected.
    InvalidString(&'static str, TextPos),

    /// An invalid ExternalID in the DTD.
    InvalidExternalID(TextPos),

    /// A comment cannot contain `--` or end with `-`.
    InvalidComment(TextPos),

    /// A Character Data node contains invalid data.
    ///
    /// Currently, only `]]>` is not allowed.
    InvalidCharacterData(TextPos),

    /// An unknown token.
    UnknownToken(TextPos),

    /// The stream ended earlier than we expected.
    ///
    /// Should only appear on invalid input data.
    UnexpectedEndOfStream,
}
147
148impl Error {
149    /// Returns the error position.
150    pub fn pos(&self) -> TextPos {
151        match *self {
152            Error::InvalidXmlPrefixUri(pos) => pos,
153            Error::UnexpectedXmlUri(pos) => pos,
154            Error::UnexpectedXmlnsUri(pos) => pos,
155            Error::InvalidElementNamePrefix(pos) => pos,
156            Error::DuplicatedNamespace(_, pos) => pos,
157            Error::UnknownNamespace(_, pos) => pos,
158            Error::UnexpectedCloseTag(_, _, pos) => pos,
159            Error::UnexpectedEntityCloseTag(pos) => pos,
160            Error::UnknownEntityReference(_, pos) => pos,
161            Error::MalformedEntityReference(pos) => pos,
162            Error::EntityReferenceLoop(pos) => pos,
163            Error::InvalidAttributeValue(pos) => pos,
164            Error::DuplicatedAttribute(_, pos) => pos,
165            Error::NoRootNode => TextPos::new(1, 1),
166            Error::UnclosedRootNode => TextPos::new(1, 1),
167            Error::UnexpectedDeclaration(pos) => pos,
168            Error::DtdDetected => TextPos::new(1, 1),
169            Error::NodesLimitReached => TextPos::new(1, 1),
170            Error::AttributesLimitReached => TextPos::new(1, 1),
171            Error::NamespacesLimitReached => TextPos::new(1, 1),
172            Error::InvalidName(pos) => pos,
173            Error::NonXmlChar(_, pos) => pos,
174            Error::InvalidChar(_, _, pos) => pos,
175            Error::InvalidChar2(_, _, pos) => pos,
176            Error::InvalidString(_, pos) => pos,
177            Error::InvalidExternalID(pos) => pos,
178            Error::InvalidComment(pos) => pos,
179            Error::InvalidCharacterData(pos) => pos,
180            Error::UnknownToken(pos) => pos,
181            Error::UnexpectedEndOfStream => TextPos::new(1, 1),
182        }
183    }
184}
185
186impl core::fmt::Display for Error {
187    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
188        match *self {
189            Error::InvalidXmlPrefixUri(pos) => {
190                write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
191            }
192            Error::UnexpectedXmlUri(pos) => {
193                write!(
194                    f,
195                    "the 'xml' namespace URI is used for not 'xml' prefix at {}",
196                    pos
197                )
198            }
199            Error::UnexpectedXmlnsUri(pos) => {
200                write!(
201                    f,
202                    "the 'xmlns' URI is used at {}, but it must not be declared",
203                    pos
204                )
205            }
206            Error::InvalidElementNamePrefix(pos) => {
207                write!(
208                    f,
209                    "the 'xmlns' prefix is used at {}, but it must not be",
210                    pos
211                )
212            }
213            Error::DuplicatedNamespace(ref name, pos) => {
214                write!(f, "namespace '{}' at {} is already defined", name, pos)
215            }
216            Error::UnknownNamespace(ref name, pos) => {
217                write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
218            }
219            Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
220                write!(
221                    f,
222                    "expected '{}' tag, not '{}' at {}",
223                    expected, actual, pos
224                )
225            }
226            Error::UnexpectedEntityCloseTag(pos) => {
227                write!(f, "unexpected close tag at {}", pos)
228            }
229            Error::MalformedEntityReference(pos) => {
230                write!(f, "malformed entity reference at {}", pos)
231            }
232            Error::UnknownEntityReference(ref name, pos) => {
233                write!(f, "unknown entity reference '{}' at {}", name, pos)
234            }
235            Error::EntityReferenceLoop(pos) => {
236                write!(f, "a possible entity reference loop is detected at {}", pos)
237            }
238            Error::InvalidAttributeValue(pos) => {
239                write!(f, "unescaped '<' found at {}", pos)
240            }
241            Error::DuplicatedAttribute(ref name, pos) => {
242                write!(f, "attribute '{}' at {} is already defined", name, pos)
243            }
244            Error::NoRootNode => {
245                write!(f, "the document does not have a root node")
246            }
247            Error::UnclosedRootNode => {
248                write!(f, "the root node was opened but never closed")
249            }
250            Error::UnexpectedDeclaration(pos) => {
251                write!(f, "unexpected XML declaration at {}", pos)
252            }
253            Error::DtdDetected => {
254                write!(f, "XML with DTD detected")
255            }
256            Error::NodesLimitReached => {
257                write!(f, "nodes limit reached")
258            }
259            Error::AttributesLimitReached => {
260                write!(f, "more than 2^32 attributes were parsed")
261            }
262            Error::NamespacesLimitReached => {
263                write!(f, "more than 2^16 unique namespaces were parsed")
264            }
265            Error::InvalidName(pos) => {
266                write!(f, "invalid name token at {}", pos)
267            }
268            Error::NonXmlChar(c, pos) => {
269                write!(f, "a non-XML character {:?} found at {}", c, pos)
270            }
271            Error::InvalidChar(expected, actual, pos) => {
272                write!(
273                    f,
274                    "expected '{}' not '{}' at {}",
275                    expected as char, actual as char, pos
276                )
277            }
278            Error::InvalidChar2(expected, actual, pos) => {
279                write!(
280                    f,
281                    "expected {} not '{}' at {}",
282                    expected, actual as char, pos
283                )
284            }
285            Error::InvalidString(expected, pos) => {
286                write!(f, "expected '{}' at {}", expected, pos)
287            }
288            Error::InvalidExternalID(pos) => {
289                write!(f, "invalid ExternalID at {}", pos)
290            }
291            Error::InvalidComment(pos) => {
292                write!(f, "comment at {} contains '--'", pos)
293            }
294            Error::InvalidCharacterData(pos) => {
295                write!(f, "']]>' at {} is not allowed inside a character data", pos)
296            }
297            Error::UnknownToken(pos) => {
298                write!(f, "unknown token at {}", pos)
299            }
300            Error::UnexpectedEndOfStream => {
301                write!(f, "unexpected end of stream")
302            }
303        }
304    }
305}
306
// Only available with the "std" feature, because `std::error::Error`
// lives in `std`.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    // Returns the same fixed text for every variant; the variant-specific
    // message is produced by the `Display` implementation.
    // NOTE(review): `Error::description` is deprecated in recent Rust
    // releases; presumably kept for callers that still use it.
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}
313
/// Parsing options.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ParsingOptions {
    /// Allow DTD parsing.
    ///
    /// When set to `false`, XML with DTD will cause an error.
    /// Empty DTD block is not an error.
    ///
    /// Currently, there is no option to simply skip DTD.
    /// Mainly because you will get `UnknownEntityReference` error later anyway.
    ///
    /// This flag is set to `false` by default for security reasons,
    /// but `roxmltree` still has checks for billion laughs attack,
    /// so this is just an extra security measure.
    ///
    /// Default: `false`
    pub allow_dtd: bool,

    /// Sets the maximum number of nodes to parse.
    ///
    /// Useful when dealing with random input to limit memory usage.
    ///
    /// Default: `u32::MAX` (no limit)
    pub nodes_limit: u32,
}

// Explicit for readability.
#[allow(clippy::derivable_impls)]
impl Default for ParsingOptions {
    /// Returns the defaults documented on each field: DTD disabled and
    /// no node limit.
    fn default() -> Self {
        Self {
            allow_dtd: false,
            // The associated constant replaces the legacy `core::u32::MAX`
            // module constant and matches the documented default.
            nodes_limit: u32::MAX,
        }
    }
}
350
/// An attribute collected for the element that is currently being parsed.
///
/// Instances are queued in `Context::current_attributes` by
/// `process_attribute` and turned into `AttributeData` by
/// `resolve_attributes`.
struct TempAttributeData<'input> {
    /// Attribute name prefix; empty for an unprefixed attribute.
    prefix: &'input str,
    /// Attribute local name.
    local: &'input str,
    /// Attribute value after `normalize_attribute` was applied.
    value: StringStorage<'input>,
    /// Range reported by the tokenizer for this attribute; its start is
    /// used for error positions.
    // NOTE(review): presumably byte offsets into the input text - confirm
    // against the tokenizer.
    range: Range<usize>,
    #[allow(unused)] // only used for feature "positions"
    qname_len: u16,
    #[allow(unused)] // only used for feature "positions"
    eq_len: u8,
}
361
362impl<'input> Document<'input> {
363    /// Parses the input XML string.
364    ///
365    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
366    /// UTF-8 string.
367    ///
368    /// This is a shorthand for `Document::parse_with_options(data, ParsingOptions::default())`.
369    ///
370    /// # Examples
371    ///
372    /// ```
373    /// let doc = roxmltree::Document::parse("<e/>").unwrap();
374    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
375    /// ```
376    #[inline]
377    pub fn parse(text: &str) -> Result<Document> {
378        Self::parse_with_options(text, ParsingOptions::default())
379    }
380
381    /// Parses the input XML string using to selected options.
382    ///
383    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
384    /// UTF-8 string.
385    ///
386    /// # Examples
387    ///
388    /// ```
389    /// let opt = roxmltree::ParsingOptions::default();
390    /// let doc = roxmltree::Document::parse_with_options("<e/>", opt).unwrap();
391    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
392    /// ```
393    #[inline]
394    pub fn parse_with_options(text: &str, opt: ParsingOptions) -> Result<Document> {
395        parse(text, opt)
396    }
397}
398
/// An entity recorded from a `Token::EntityDeclaration`.
struct Entity<'input> {
    /// Entity name as reported by the tokenizer.
    name: &'input str,
    /// Entity definition span as reported by the tokenizer.
    value: StrSpan<'input>,
}
403
/// Name of the element tag that was opened most recently.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Tag name prefix; empty when the tag has no prefix.
    prefix: &'input str,
    /// Local tag name; empty until the first start tag is seen.
    name: &'input str,
    /// Start offset reported with the `ElementStart` token.
    pos: usize,
    /// Offset used for prefix-related errors (`pos + 1` for a real tag).
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    /// Creates a placeholder span with empty names and zero offsets.
    #[inline]
    fn new_null() -> Self {
        let (prefix, name) = ("", "");
        let (pos, prefix_pos) = (0, 0);
        TagNameSpan {
            prefix,
            name,
            pos,
            prefix_pos,
        }
    }
}
423
424/// An entity loop detector.
425///
426/// Limits:
427/// - Entities depth is 10.
428/// - Maximum number of entity references per entity reference is 255.
429///
430/// Basically, if a text or an attribute has an entity reference and this reference
431/// has more than 10 nested references - this is an error.
432///
433/// This is useful for simple loops like:
434///
435/// ```text
436/// <!ENTITY a '&b;'>
437/// <!ENTITY b '&a;'>
438/// ```
439///
440/// And, if a text or an attribute has an entity reference and it references more
441/// than 255 references - this is an error.
442///
443/// This is useful for cases like billion laughs attack, where depth can be pretty small,
444/// but the number of references is exponentially increasing:
445///
446/// ```text
447/// <!ENTITY lol "lol">
448/// <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
449/// <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
450/// <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
451/// <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
452/// ```
453#[derive(Default)]
454struct LoopDetector {
455    /// References depth.
456    depth: u8,
457    /// Number of references resolved by the root reference.
458    references: u8,
459}
460
461impl LoopDetector {
462    #[inline]
463    fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
464        if self.depth < 10 {
465            self.depth += 1;
466            Ok(())
467        } else {
468            Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
469        }
470    }
471
472    #[inline]
473    fn dec_depth(&mut self) {
474        if self.depth > 0 {
475            self.depth -= 1;
476        }
477
478        // Reset references count after reaching zero depth.
479        if self.depth == 0 {
480            self.references = 0;
481        }
482    }
483
484    #[inline]
485    fn inc_references(&mut self, stream: &Stream) -> Result<()> {
486        if self.depth == 0 {
487            // Allow infinite amount of references at zero depth.
488            Ok(())
489        } else {
490            if self.references == core::u8::MAX {
491                return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
492            }
493
494            self.references += 1;
495            Ok(())
496        }
497    }
498}
499
500struct Context<'input> {
501    opt: ParsingOptions,
502    namespace_start_idx: usize,
503    current_attributes: Vec<TempAttributeData<'input>>,
504    awaiting_subtree: Vec<NodeId>,
505    parent_prefixes: Vec<&'input str>,
506    entities: Vec<Entity<'input>>,
507    after_text: bool,
508    parent_id: NodeId,
509    tag_name: TagNameSpan<'input>,
510    loop_detector: LoopDetector,
511    doc: Document<'input>,
512}
513
514impl<'input> Context<'input> {
515    fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
516        if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
517            return Err(Error::NodesLimitReached);
518        }
519
520        #[cfg(not(feature = "positions"))]
521        let _ = range;
522
523        let new_child_id = NodeId::from(self.doc.nodes.len());
524
525        let appending_element = matches!(kind, NodeKind::Element { .. });
526        self.doc.nodes.push(NodeData {
527            parent: Some(self.parent_id),
528            prev_sibling: None,
529            next_subtree: None,
530            last_child: None,
531            kind,
532            #[cfg(feature = "positions")]
533            range,
534        });
535
536        let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
537        self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
538        self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
539
540        for id in &self.awaiting_subtree {
541            self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
542        }
543        self.awaiting_subtree.clear();
544
545        if !appending_element {
546            self.awaiting_subtree
547                .push(NodeId::from(self.doc.nodes.len() - 1));
548        }
549
550        Ok(new_child_id)
551    }
552
553    fn err_pos_at(&self, pos: usize) -> TextPos {
554        self.doc.text_pos_at(pos)
555    }
556}
557
558fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
559    // Trying to guess rough nodes and attributes amount.
560    let nodes_capacity = text.bytes().filter(|c| *c == b'<').count();
561    let attributes_capacity = text.bytes().filter(|c| *c == b'=').count();
562
563    // Init document.
564    let mut doc = Document {
565        text,
566        nodes: Vec::with_capacity(nodes_capacity),
567        attributes: Vec::with_capacity(attributes_capacity),
568        namespaces: Namespaces::default(),
569    };
570
571    // Add a root node.
572    doc.nodes.push(NodeData {
573        parent: None,
574        prev_sibling: None,
575        next_subtree: None,
576        last_child: None,
577        kind: NodeKind::Root,
578        #[cfg(feature = "positions")]
579        range: 0..text.len(),
580    });
581
582    doc.namespaces
583        .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
584
585    let mut ctx = Context {
586        opt,
587        namespace_start_idx: 1,
588        current_attributes: Vec::with_capacity(16),
589        entities: Vec::new(),
590        awaiting_subtree: Vec::new(),
591        parent_prefixes: Vec::new(),
592        after_text: false,
593        parent_id: NodeId::new(0),
594        tag_name: TagNameSpan::new_null(),
595        loop_detector: LoopDetector::default(),
596        doc,
597    };
598    ctx.parent_prefixes.push("");
599
600    tokenizer::parse(text, opt.allow_dtd, &mut ctx)?;
601
602    let mut doc = ctx.doc;
603    if !doc.root().children().any(|n| n.is_element()) {
604        return Err(Error::NoRootNode);
605    }
606
607    if ctx.parent_prefixes.len() > 1 {
608        return Err(Error::UnclosedRootNode);
609    }
610
611    doc.nodes.shrink_to_fit();
612    doc.attributes.shrink_to_fit();
613    doc.namespaces.shrink_to_fit();
614
615    Ok(doc)
616}
617
618impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
619    fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
620        match token {
621            tokenizer::Token::ProcessingInstruction(target, value, range) => {
622                let pi = NodeKind::PI(PI { target, value });
623                self.append_node(pi, range)?;
624                self.after_text = false;
625            }
626            tokenizer::Token::Comment(text, range) => {
627                self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
628                self.after_text = false;
629            }
630            tokenizer::Token::EntityDeclaration(name, definition) => {
631                self.entities.push(Entity {
632                    name,
633                    value: definition,
634                });
635            }
636            tokenizer::Token::ElementStart(prefix, local, start) => {
637                if prefix == XMLNS {
638                    let pos = self.err_pos_at(start + 1);
639                    return Err(Error::InvalidElementNamePrefix(pos));
640                }
641
642                self.tag_name = TagNameSpan {
643                    prefix,
644                    name: local,
645                    pos: start,
646                    prefix_pos: start + 1,
647                };
648
649                self.after_text = false;
650            }
651            tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
652                process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
653            }
654            tokenizer::Token::ElementEnd(end, range) => {
655                process_element(end, range, self)?;
656                self.after_text = false;
657            }
658            tokenizer::Token::Text(text, range) => {
659                process_text(text, range, self)?;
660            }
661            tokenizer::Token::Cdata(text, range) => {
662                process_cdata(text, range, self)?;
663            }
664        }
665
666        Ok(())
667    }
668}
669
670#[allow(clippy::too_many_arguments)]
671fn process_attribute<'input>(
672    range: Range<usize>,
673    qname_len: u16,
674    eq_len: u8,
675    prefix: &'input str,
676    local: &'input str,
677    value: StrSpan<'input>,
678    ctx: &mut Context<'input>,
679) -> Result<()> {
680    let value = normalize_attribute(value, ctx)?;
681
682    if prefix == XMLNS {
683        // The xmlns namespace MUST NOT be declared as the default namespace.
684        if value.as_str() == NS_XMLNS_URI {
685            let pos = ctx.err_pos_at(range.start);
686            return Err(Error::UnexpectedXmlnsUri(pos));
687        }
688
689        let is_xml_ns_uri = value.as_str() == NS_XML_URI;
690
691        // The prefix 'xml' is by definition bound to the namespace name
692        // http://www.w3.org/XML/1998/namespace.
693        // It MUST NOT be bound to any other namespace name.
694        if local == NS_XML_PREFIX {
695            if !is_xml_ns_uri {
696                let pos = ctx.err_pos_at(range.start);
697                return Err(Error::InvalidXmlPrefixUri(pos));
698            }
699        } else {
700            // The xml namespace MUST NOT be bound to a non-xml prefix.
701            if is_xml_ns_uri {
702                let pos = ctx.err_pos_at(range.start);
703                return Err(Error::UnexpectedXmlUri(pos));
704            }
705        }
706
707        // Check for duplicated namespaces.
708        if ctx
709            .doc
710            .namespaces
711            .exists(ctx.namespace_start_idx, Some(local))
712        {
713            let pos = ctx.err_pos_at(range.start);
714            return Err(Error::DuplicatedNamespace(local.to_string(), pos));
715        }
716
717        // Xml namespace should not be added to the namespaces.
718        if !is_xml_ns_uri {
719            ctx.doc.namespaces.push_ns(Some(local), value)?;
720        }
721    } else if local == XMLNS {
722        // The xml namespace MUST NOT be declared as the default namespace.
723        if value.as_str() == NS_XML_URI {
724            let pos = ctx.err_pos_at(range.start);
725            return Err(Error::UnexpectedXmlUri(pos));
726        }
727
728        // The xmlns namespace MUST NOT be declared as the default namespace.
729        if value.as_str() == NS_XMLNS_URI {
730            let pos = ctx.err_pos_at(range.start);
731            return Err(Error::UnexpectedXmlnsUri(pos));
732        }
733
734        ctx.doc.namespaces.push_ns(None, value)?;
735    } else {
736        ctx.current_attributes.push(TempAttributeData {
737            prefix,
738            local,
739            value,
740            range,
741            qname_len,
742            eq_len,
743        });
744    }
745
746    Ok(())
747}
748
749fn process_element<'input>(
750    end_token: tokenizer::ElementEnd<'input>,
751    token_range: Range<usize>,
752    ctx: &mut Context<'input>,
753) -> Result<()> {
754    if ctx.tag_name.name.is_empty() {
755        // May occur in XML like this:
756        // <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
757        // <root>&p;</root>
758
759        if let tokenizer::ElementEnd::Close(..) = end_token {
760            return Err(Error::UnexpectedEntityCloseTag(
761                ctx.err_pos_at(token_range.start),
762            ));
763        } else {
764            unreachable!("should be already checked by the tokenizer");
765        }
766    }
767
768    let namespaces = ctx.resolve_namespaces();
769    ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
770
771    let attributes = resolve_attributes(namespaces, ctx)?;
772
773    match end_token {
774        tokenizer::ElementEnd::Empty => {
775            let tag_ns_idx = get_ns_idx_by_prefix(
776                namespaces,
777                ctx.tag_name.prefix_pos,
778                ctx.tag_name.prefix,
779                ctx,
780            )?;
781            let new_element_id = ctx.append_node(
782                NodeKind::Element {
783                    tag_name: ExpandedNameIndexed {
784                        namespace_idx: tag_ns_idx,
785                        local_name: ctx.tag_name.name,
786                    },
787                    attributes,
788                    namespaces,
789                },
790                ctx.tag_name.pos..token_range.end,
791            )?;
792            ctx.awaiting_subtree.push(new_element_id);
793        }
794        tokenizer::ElementEnd::Close(prefix, local) => {
795            let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
796            // should never panic as we start with the single prefix of the
797            // root node and always push another one when changing the parent
798            let parent_prefix = *ctx.parent_prefixes.last().unwrap();
799
800            #[cfg(feature = "positions")]
801            {
802                parent_node.range.end = token_range.end;
803            }
804
805            if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
806                if prefix != parent_prefix || local != tag_name.local_name {
807                    return Err(Error::UnexpectedCloseTag(
808                        gen_qname_string(parent_prefix, tag_name.local_name),
809                        gen_qname_string(prefix, local),
810                        ctx.err_pos_at(token_range.start),
811                    ));
812                }
813            }
814            ctx.awaiting_subtree.push(ctx.parent_id);
815
816            if let Some(id) = parent_node.parent {
817                ctx.parent_id = id;
818                ctx.parent_prefixes.pop();
819                debug_assert!(!ctx.parent_prefixes.is_empty());
820            } else {
821                unreachable!("should be already checked by the tokenizer");
822            }
823        }
824        tokenizer::ElementEnd::Open => {
825            let tag_ns_idx = get_ns_idx_by_prefix(
826                namespaces,
827                ctx.tag_name.prefix_pos,
828                ctx.tag_name.prefix,
829                ctx,
830            )?;
831            ctx.parent_id = ctx.append_node(
832                NodeKind::Element {
833                    tag_name: ExpandedNameIndexed {
834                        namespace_idx: tag_ns_idx,
835                        local_name: ctx.tag_name.name,
836                    },
837                    attributes,
838                    namespaces,
839                },
840                ctx.tag_name.pos..token_range.end,
841            )?;
842            ctx.parent_prefixes.push(ctx.tag_name.prefix);
843        }
844    }
845
846    Ok(())
847}
848
849impl Context<'_> {
850    fn resolve_namespaces(&mut self) -> ShortRange {
851        if let NodeKind::Element { ref namespaces, .. } =
852            self.doc.nodes[self.parent_id.get_usize()].kind
853        {
854            let parent_ns = *namespaces;
855            if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
856                return parent_ns;
857            }
858
859            for i in parent_ns.to_urange() {
860                if !self.doc.namespaces.exists(
861                    self.namespace_start_idx,
862                    self.doc
863                        .namespaces
864                        .get(self.doc.namespaces.tree_order[i])
865                        .name,
866                ) {
867                    self.doc.namespaces.push_ref(i);
868                }
869            }
870        }
871
872        (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
873    }
874}
875
876fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
877    if ctx.current_attributes.is_empty() {
878        return Ok(ShortRange::new(0, 0));
879    }
880
881    if ctx.doc.attributes.len() + ctx.current_attributes.len() >= core::u32::MAX as usize {
882        return Err(Error::AttributesLimitReached);
883    }
884
885    let start_idx = ctx.doc.attributes.len();
886
887    let current_attributes = core::mem::take(&mut ctx.current_attributes);
888    for attr in current_attributes {
889        let namespace_idx = if attr.prefix == NS_XML_PREFIX {
890            // The prefix 'xml' is by definition bound to the namespace name
891            // http://www.w3.org/XML/1998/namespace. This namespace is added
892            // to the document on creation and is always element 0.
893            Some(NamespaceIdx(0))
894        } else if attr.prefix.is_empty() {
895            // 'The namespace name for an unprefixed attribute name
896            // always has no value.'
897            None
898        } else {
899            get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, ctx)?
900        };
901
902        let attr_name = ExpandedNameIndexed {
903            namespace_idx,
904            local_name: attr.local,
905        };
906
907        // Check for duplicated attributes.
908        if ctx.doc.attributes[start_idx..].iter().any(|attr| {
909            attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
910        }) {
911            let pos = ctx.err_pos_at(attr.range.start);
912            return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
913        }
914
915        ctx.doc.attributes.push(AttributeData {
916            name: attr_name,
917            value: attr.value,
918            #[cfg(feature = "positions")]
919            range: attr.range,
920            #[cfg(feature = "positions")]
921            qname_len: attr.qname_len,
922            #[cfg(feature = "positions")]
923            eq_len: attr.eq_len,
924        });
925    }
926
927    Ok((start_idx..ctx.doc.attributes.len()).into())
928}
929
// Processes a run of character data and adds it to the document.
//
// Line endings are normalized and character/entity references are expanded.
// The value of a referenced entity is parsed as content via
// `tokenizer::parse_content`, which may call back into `ctx`.
//
// `range` is the position of `text` inside `ctx.doc.text`; every text piece
// produced here is appended with this same range.
//
// Errors from `parse_next_chunk`, `append_text`, the loop detector and
// `tokenizer::parse_content` are propagated.
fn process_text<'input>(
    text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    // Add text as is if it has only valid characters.
    // (No `&` and no `\r` means there is nothing to expand or normalize,
    // so the input slice can be borrowed without copying.)
    if !text.bytes().any(|b| b == b'&' || b == b'\r') {
        append_text(StringStorage::Borrowed(text), range, ctx)?;
        ctx.after_text = true;
        return Ok(());
    }

    let mut text_buffer = TextBuffer::new();
    // Set after a character reference was pushed unmodified (see the `Char`
    // branch below). It makes the next plain byte go through `push_raw` too,
    // because `push_from_text` rewrites a trailing `\r` in the buffer into
    // `\n` and would therefore alter the character that was just added as is.
    let mut is_as_is = false;
    // NOTE(review): the stream re-reads `ctx.doc.text` at `range`; presumably
    // this is the same slice as `text` - confirm against the caller.
    let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
    while !stream.at_end() {
        match parse_next_chunk(&mut stream, &ctx.entities)? {
            NextChunk::Byte(c) => {
                if is_as_is {
                    text_buffer.push_raw(c);
                    is_as_is = false;
                } else {
                    text_buffer.push_from_text(c, stream.at_end());
                }
            }
            NextChunk::Char(c) => {
                for b in CharToBytes::new(c) {
                    // A non-zero depth means that we are inside an entity value.
                    if ctx.loop_detector.depth > 0 {
                        text_buffer.push_from_text(b, stream.at_end());
                    } else {
                        // Characters not from entity should be added as is.
                        // Not sure why... At least `lxml` produces the same result.
                        text_buffer.push_raw(b);
                        is_as_is = true;
                    }
                }
            }
            NextChunk::Text(fragment) => {
                is_as_is = false;

                // Flush the text collected so far, so it ends up before
                // whatever the entity value produces.
                if !text_buffer.is_empty() {
                    let storage = StringStorage::new_owned(text_buffer.to_str());
                    append_text(storage, range.clone(), ctx)?;
                    text_buffer.clear();
                    ctx.after_text = true;
                }

                // Both calls are fallible.
                // NOTE(review): presumably they limit the number and the
                // nesting of entity references - confirm in the loop detector.
                ctx.loop_detector.inc_references(&stream)?;
                ctx.loop_detector.inc_depth(&stream)?;

                // Parse the entity value as regular content. The current tag
                // name is replaced with a null one for the duration of the
                // parse and restored afterwards. This `stream` shadows the
                // outer one only inside this branch.
                let mut stream = Stream::from_substr(ctx.doc.text, fragment.range());
                let prev_tag_name = ctx.tag_name;
                ctx.tag_name = TagNameSpan::new_null();
                tokenizer::parse_content(&mut stream, ctx)?;
                ctx.tag_name = prev_tag_name;
                // The buffer is already empty at this point (flushed above).
                text_buffer.clear();

                ctx.loop_detector.dec_depth();
            }
        }
    }

    // Flush whatever is left after the last chunk.
    if !text_buffer.is_empty() {
        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
        ctx.after_text = true;
    }

    Ok(())
}
999
1000// While the whole purpose of CDATA is to indicate to an XML library that this text
1001// has to be stored as is, carriage return (`\r`) is still has to be replaced with `\n`.
1002fn process_cdata<'input>(
1003    text: &'input str,
1004    range: Range<usize>,
1005    ctx: &mut Context<'input>,
1006) -> Result<()> {
1007    // Add text as is if it has only valid characters.
1008    if !text.as_bytes().contains(&b'\r') {
1009        append_text(StringStorage::Borrowed(text), range, ctx)?;
1010        ctx.after_text = true;
1011        return Ok(());
1012    }
1013
1014    let mut text_buffer = TextBuffer::new();
1015    let count = text.chars().count();
1016    for (i, c) in text.chars().enumerate() {
1017        for b in CharToBytes::new(c) {
1018            text_buffer.push_from_text(b, i + 1 == count);
1019        }
1020    }
1021
1022    if !text_buffer.is_empty() {
1023        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
1024        ctx.after_text = true;
1025    }
1026
1027    Ok(())
1028}
1029
1030fn append_text<'input>(
1031    text: StringStorage<'input>,
1032    range: Range<usize>,
1033    ctx: &mut Context<'input>,
1034) -> Result<()> {
1035    if ctx.after_text {
1036        // Prepend to a previous text node.
1037        if let Some(node) = ctx.doc.nodes.last_mut() {
1038            if let NodeKind::Text(ref mut prev_text) = node.kind {
1039                let text_str = text.as_str();
1040                let prev_text_str = prev_text.as_str();
1041
1042                let mut concat_text = String::with_capacity(text_str.len() + prev_text_str.len());
1043                concat_text.push_str(prev_text_str);
1044                concat_text.push_str(text_str);
1045                *prev_text = StringStorage::new_owned(concat_text);
1046            }
1047        }
1048    } else {
1049        ctx.append_node(NodeKind::Text(text), range)?;
1050    }
1051
1052    Ok(())
1053}
1054
/// A single unit of text content, as produced by `parse_next_chunk`.
enum NextChunk<'a> {
    /// A plain input byte, i.e. any byte other than `&`.
    Byte(u8),
    /// A character obtained from a character reference.
    Char(char),
    /// The value of the entity whose name matched an entity reference.
    Text(StrSpan<'a>),
}
1060
1061fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1062    debug_assert!(!stream.at_end());
1063
1064    // Safe, because we already checked that stream is not at the end.
1065    // But we have an additional `debug_assert` above just in case.
1066    let c = stream.curr_byte_unchecked();
1067
1068    // Check for character/entity references.
1069    if c == b'&' {
1070        let start = stream.pos();
1071        match stream.try_consume_reference() {
1072            Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1073            Some(Reference::Entity(name)) => entities
1074                .iter()
1075                .find(|e| e.name == name)
1076                .map(|e| NextChunk::Text(e.value))
1077                .ok_or_else(|| {
1078                    let pos = stream.gen_text_pos_from(start);
1079                    Error::UnknownEntityReference(name.into(), pos)
1080                }),
1081            None => {
1082                let pos = stream.gen_text_pos_from(start);
1083                Err(Error::MalformedEntityReference(pos))
1084            }
1085        }
1086    } else {
1087        stream.advance(1);
1088        Ok(NextChunk::Byte(c))
1089    }
1090}
1091
1092// https://www.w3.org/TR/REC-xml/#AVNormalize
1093fn normalize_attribute<'input>(
1094    text: StrSpan<'input>,
1095    ctx: &mut Context<'input>,
1096) -> Result<StringStorage<'input>> {
1097    if is_normalization_required(&text) {
1098        let mut text_buffer = TextBuffer::new();
1099        _normalize_attribute(text, &mut text_buffer, ctx)?;
1100        Ok(StringStorage::new_owned(text_buffer.finish()))
1101    } else {
1102        Ok(StringStorage::Borrowed(text.as_str()))
1103    }
1104}
1105
1106#[inline]
1107fn is_normalization_required(text: &StrSpan) -> bool {
1108    // We assume that `&` indicates an entity or a character reference.
1109    // But in rare cases it can be just an another character.
1110
1111    fn check(c: u8) -> bool {
1112        matches!(c, b'&' | b'\t' | b'\n' | b'\r')
1113    }
1114
1115    text.as_str().bytes().any(check)
1116}
1117
1118fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1119    let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1120    while !stream.at_end() {
1121        // Safe, because we already checked that the stream is not at the end.
1122        let c = stream.curr_byte_unchecked();
1123
1124        if c != b'&' {
1125            stream.advance(1);
1126            buffer.push_from_attr(c, stream.curr_byte().ok());
1127            continue;
1128        }
1129
1130        // Check for character/entity references.
1131        let start = stream.pos();
1132        match stream.try_consume_reference() {
1133            Some(Reference::Char(ch)) => {
1134                for b in CharToBytes::new(ch) {
1135                    if ctx.loop_detector.depth > 0 {
1136                        // Escaped `<` inside an ENTITY is an error.
1137                        // Escaped `<` outside an ENTITY is ok.
1138                        if b == b'<' {
1139                            return Err(Error::InvalidAttributeValue(
1140                                stream.gen_text_pos_from(start),
1141                            ));
1142                        }
1143
1144                        buffer.push_from_attr(b, None);
1145                    } else {
1146                        // Characters not from entity should be added as is.
1147                        // Not sure why... At least `lxml` produces the same results.
1148                        buffer.push_raw(b);
1149                    }
1150                }
1151            }
1152            Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
1153                Some(entity) => {
1154                    ctx.loop_detector.inc_references(&stream)?;
1155                    ctx.loop_detector.inc_depth(&stream)?;
1156                    _normalize_attribute(entity.value, buffer, ctx)?;
1157                    ctx.loop_detector.dec_depth();
1158                }
1159                None => {
1160                    let pos = stream.gen_text_pos_from(start);
1161                    return Err(Error::UnknownEntityReference(name.into(), pos));
1162                }
1163            },
1164            None => {
1165                let pos = stream.gen_text_pos_from(start);
1166                return Err(Error::MalformedEntityReference(pos));
1167            }
1168        }
1169    }
1170
1171    Ok(())
1172}
1173
1174fn get_ns_idx_by_prefix(
1175    namespaces: ShortRange,
1176    prefix_pos: usize,
1177    prefix: &str,
1178    ctx: &Context,
1179) -> Result<Option<NamespaceIdx>> {
1180    // Prefix CAN be empty when the default namespace was defined.
1181    //
1182    // Example:
1183    // <e xmlns='http://www.w3.org'/>
1184    let prefix_opt = if prefix.is_empty() {
1185        None
1186    } else {
1187        Some(prefix)
1188    };
1189
1190    let idx = ctx.doc.namespaces.tree_order[namespaces.to_urange()]
1191        .iter()
1192        .find(|idx| ctx.doc.namespaces.get(**idx).name == prefix_opt);
1193
1194    match idx {
1195        Some(idx) => Ok(Some(*idx)),
1196        None => {
1197            if !prefix.is_empty() {
1198                // If an URI was not found and prefix IS NOT empty than
1199                // we have an unknown namespace.
1200                //
1201                // Example:
1202                // <e random:a='b'/>
1203                let pos = ctx.err_pos_at(prefix_pos);
1204                Err(Error::UnknownNamespace(prefix.to_string(), pos))
1205            } else {
1206                // If an URI was not found and prefix IS empty than
1207                // an element or an attribute doesn't have a namespace.
1208                //
1209                // Example:
1210                // <e a='b'/>
1211                Ok(None)
1212            }
1213        }
1214    }
1215}
1216
/// Builds a qualified name string.
///
/// Returns `local` when `prefix` is empty and `prefix:local` otherwise.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    if prefix.is_empty() {
        return local.to_string();
    }

    // One extra byte for the `:` separator.
    let mut qname = String::with_capacity(prefix.len() + 1 + local.len());
    qname.push_str(prefix);
    qname.push(':');
    qname.push_str(local);
    qname
}
1224
/// Iterate over `char` by `u8`.
///
/// Yields the UTF-8 encoding of a character one byte at a time.
struct CharToBytes {
    /// UTF-8 bytes of the character, padded with `0xFF`.
    buf: [u8; 4],
    /// Index of the next byte to yield; 4 once the iterator is exhausted.
    idx: u8,
}

impl CharToBytes {
    #[inline]
    fn new(c: char) -> Self {
        // 0xFF never occurs in valid UTF-8, so it can mark the unused tail.
        let mut encoded = [0xFF_u8; 4];
        c.encode_utf8(&mut encoded);

        CharToBytes {
            buf: encoded,
            idx: 0,
        }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // `get` returns `None` once all four slots were consumed.
        let byte = *self.buf.get(usize::from(self.idx))?;

        if byte == 0xFF {
            // Reached the padding: mark the iterator as exhausted.
            self.idx = 4;
            return None;
        }

        self.idx += 1;
        Some(byte)
    }
}
1260
/// A byte buffer for building normalized text and attribute values.
///
/// Callers are expected to push bytes that form a valid UTF-8 string;
/// `to_str` and `finish` panic otherwise.
struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        let buffer = Vec::with_capacity(32);
        TextBuffer { buffer }
    }

    /// Appends a byte without any processing.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value.
    ///
    /// `next` is the byte that follows `current` in the input, if any.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        // \r in \r\n should be ignored.
        let is_crlf_start = current == b'\r' && next == Some(b'\n');
        if is_crlf_start {
            return;
        }

        // \n, \r and \t should be converted into spaces.
        let is_whitespace = matches!(current, b'\n' | b'\r' | b'\t');
        self.buffer.push(if is_whitespace { b' ' } else { current });
    }

    /// Appends a byte of text content.
    ///
    /// Translates \r\n and any \r that is not followed by \n into a single \n character.
    /// `at_end` must be true when `c` is the last byte of the input.
    ///
    /// https://www.w3.org/TR/xml/#sec-line-ends
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        // A pending `\r` always turns into `\n`. When it is directly followed
        // by `\n`, that `\n` belongs to the same line break and must be dropped.
        let mut after_cr = false;
        if let Some(previous) = self.buffer.last_mut() {
            if *previous == b'\r' {
                *previous = b'\n';
                after_cr = true;
            }
        }

        match c {
            // Nothing can follow the last `\r`, so it is resolved immediately.
            b'\r' if at_end => self.buffer.push(b'\n'),
            b'\n' if after_cr => {}
            _ => self.buffer.push(c),
        }
    }

    /// Removes all bytes from the buffer.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Checks whether the buffer has no bytes.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Returns the content as a string slice.
    #[inline]
    fn to_str(&self) -> &str {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        let content = core::str::from_utf8(&self.buffer);
        content.unwrap()
    }

    /// Consumes the buffer and returns its content as a `String`.
    #[inline]
    fn finish(self) -> String {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        let content = String::from_utf8(self.buffer);
        content.unwrap()
    }
}