quick_xml/reader/
state.rs

1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, IllFormedError, Result, SyntaxError};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{BangType, Config, ParseState};
10use crate::utils::{is_whitespace, name_len};
11
12/// A struct that holds a current reader state and a parser configuration.
13/// It is independent on a way of reading data: the reader feed data into it and
14/// get back produced [`Event`]s.
15#[derive(Clone, Debug)]
16pub(super) struct ReaderState {
17    /// Number of bytes read from the source of data since the reader was created
18    pub offset: u64,
19    /// A snapshot of an `offset` of the last error returned. It can be less than
20    /// `offset`, because some errors conveniently report at earlier position,
21    /// and changing `offset` is not possible, because `Error::IllFormed` errors
22    /// are recoverable.
23    pub last_error_offset: u64,
24    /// Defines how to process next byte
25    pub state: ParseState,
26    /// User-defined settings that affect parsing
27    pub config: Config,
28    /// All currently Started elements which didn't have a matching
29    /// End element yet.
30    ///
31    /// For an XML
32    ///
33    /// ```xml
34    /// <root><one/><inner attr="value">|<tag></inner></root>
35    /// ```
36    /// when cursor at the `|` position buffer contains:
37    ///
38    /// ```text
39    /// rootinner
40    /// ^   ^
41    /// ```
42    ///
43    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
44    /// (0 and 4 in that case).
45    opened_buffer: Vec<u8>,
46    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
47    /// for that field for details
48    opened_starts: Vec<usize>,
49
50    #[cfg(feature = "encoding")]
51    /// Reference to the encoding used to read an XML
52    pub encoding: EncodingRef,
53}
54
55impl ReaderState {
56    /// Trims end whitespaces from `bytes`, if required, and returns a text event.
57    ///
58    /// # Parameters
59    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
60    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
61        let mut content = bytes;
62
63        if self.config.trim_text_end {
64            // Skip the ending '<'
65            let len = bytes
66                .iter()
67                .rposition(|&b| !is_whitespace(b))
68                .map_or(0, |p| p + 1);
69            content = &bytes[..len];
70        }
71        BytesText::wrap(content, self.decoder())
72    }
73
74    /// Returns `Comment`, `CData` or `DocType` event.
75    ///
76    /// `buf` contains data between `<` and `>`:
77    /// - CDATA: `![CDATA[...]]`
78    /// - Comment: `!--...--`
79    /// - Doctype (uppercase): `!D...`
80    /// - Doctype (lowercase): `!d...`
81    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
82        debug_assert_eq!(
83            buf.first(),
84            Some(&b'!'),
85            "CDATA, comment or DOCTYPE should start from '!'"
86        );
87
88        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
89            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
90        };
91
92        let len = buf.len();
93        match bang_type {
94            BangType::Comment if buf.starts_with(b"!--") => {
95                debug_assert!(buf.ends_with(b"--"));
96                if self.config.check_comments {
97                    // search if '--' not in comments
98                    let mut haystack = &buf[3..len - 2];
99                    let mut off = 0;
100                    while let Some(p) = memchr::memchr(b'-', haystack) {
101                        off += p + 1;
102                        // if next byte after `-` is also `-`, return an error
103                        if buf[3 + off] == b'-' {
104                            // Explanation of the magic:
105                            //
106                            // - `self.offset`` just after `>`,
107                            // - `buf` contains `!-- con--tent --`
108                            // - `p` is counted from byte after `<!--`
109                            //
110                            // <!-- con--tent -->:
111                            //  ~~~~~~~~~~~~~~~~ : - buf
112                            //   : ===========   : - zone of search (possible values of `p`)
113                            //   : |---p         : - p is counted from | (| is 0)
114                            //   : :   :         ^ - self.offset
115                            //   ^ :   :           - self.offset - len
116                            //     ^   :           - self.offset - len + 2
117                            //         ^           - self.offset - len + 2 + p
118                            self.last_error_offset = self.offset - len as u64 + 2 + p as u64;
119                            return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
120                        }
121                        // Continue search after single `-` (+1 to skip it)
122                        haystack = &haystack[p + 1..];
123                    }
124                }
125                Ok(Event::Comment(BytesText::wrap(
126                    // Cut of `!--` and `--` from start and end
127                    &buf[3..len - 2],
128                    self.decoder(),
129                )))
130            }
131            // XML requires uppercase only:
132            // https://www.w3.org/TR/xml11/#sec-cdata-sect
133            // Even HTML5 required uppercase only:
134            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
135            BangType::CData if buf.starts_with(b"![CDATA[") => {
136                debug_assert!(buf.ends_with(b"]]"));
137                Ok(Event::CData(BytesCData::wrap(
138                    // Cut of `![CDATA[` and `]]` from start and end
139                    &buf[8..len - 2],
140                    self.decoder(),
141                )))
142            }
143            // XML requires uppercase only, but we will check that on validation stage:
144            // https://www.w3.org/TR/xml11/#sec-prolog-dtd
145            // HTML5 allows mixed case for doctype declarations:
146            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
147            BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => {
148                match buf[8..].iter().position(|&b| !is_whitespace(b)) {
149                    Some(start) => Ok(Event::DocType(BytesText::wrap(
150                        // Cut of `!DOCTYPE` and any number of spaces from start
151                        &buf[8 + start..],
152                        self.decoder(),
153                    ))),
154                    None => {
155                        // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
156                        // We want report error at place where name is expected - this is just
157                        // before `>`
158                        self.last_error_offset = self.offset - 1;
159                        return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
160                    }
161                }
162            }
163            _ => {
164                // <!....>
165                //  ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
166                // ^------- We report error at that position, so we need to subtract 2 and buf len
167                self.last_error_offset = self.offset - len as u64 - 2;
168                Err(bang_type.to_err().into())
169            }
170        }
171    }
172
173    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
174    /// end name matches the last opened start name if `self.config.check_end_names` is set.
175    ///
176    /// `buf` contains data between `<` and `>`, for example `/tag`.
177    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
178        debug_assert_eq!(
179            buf.first(),
180            Some(&b'/'),
181            "closing tag should start from '/'"
182        );
183
184        // Strip the `/` character. `content` contains data between `</` and `>`
185        let content = &buf[1..];
186        // XML standard permits whitespaces after the markup name in closing tags.
187        // Let's strip them from the buffer before comparing tag names.
188        let name = if self.config.trim_markup_names_in_closing_tags {
189            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
190                &content[..pos_end_name + 1]
191            } else {
192                content
193            }
194        } else {
195            content
196        };
197
198        let decoder = self.decoder();
199
200        // Get the index in self.opened_buffer of the name of the last opened tag
201        match self.opened_starts.pop() {
202            Some(start) => {
203                if self.config.check_end_names {
204                    let expected = &self.opened_buffer[start..];
205                    if name != expected {
206                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
207                        // #513: In order to allow error recovery we should drop content of the buffer
208                        self.opened_buffer.truncate(start);
209
210                        // Report error at start of the end tag at `<` character
211                        // -2 for `<` and `>`
212                        self.last_error_offset = self.offset - buf.len() as u64 - 2;
213                        return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
214                            expected,
215                            found: decoder.decode(name).unwrap_or_default().into_owned(),
216                        }));
217                    }
218                }
219
220                self.opened_buffer.truncate(start);
221            }
222            None => {
223                if !self.config.allow_unmatched_ends {
224                    // Report error at start of the end tag at `<` character
225                    // -2 for `<` and `>`
226                    self.last_error_offset = self.offset - buf.len() as u64 - 2;
227                    return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
228                        decoder.decode(name).unwrap_or_default().into_owned(),
229                    )));
230                }
231            }
232        }
233
234        Ok(Event::End(BytesEnd::wrap(name.into())))
235    }
236
237    /// `buf` contains data between `<` and `>` and the first byte is `?`.
238    /// `self.offset` already after the `>`
239    ///
240    /// Returns `Decl` or `PI` event
241    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
242        debug_assert!(buf.len() > 0);
243        debug_assert_eq!(buf[0], b'?');
244
245        let len = buf.len();
246        // We accept at least <??>
247        //                     ~~ - len = 2
248        if len > 1 && buf[len - 1] == b'?' {
249            // Cut of `?` and `?` from start and end
250            let content = &buf[1..len - 1];
251            let len = content.len();
252
253            if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
254                let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
255
256                // Try getting encoding from the declaration event
257                #[cfg(feature = "encoding")]
258                if self.encoding.can_be_refined() {
259                    if let Some(encoding) = event.encoder() {
260                        self.encoding = EncodingRef::XmlDetected(encoding);
261                    }
262                }
263
264                Ok(Event::Decl(event))
265            } else {
266                Ok(Event::PI(BytesPI::wrap(content, name_len(content))))
267            }
268        } else {
269            // <?....EOF
270            //  ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
271            //          so we move offset to it (-2 for `<` and `>`)
272            self.last_error_offset = self.offset - len as u64 - 2;
273            Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
274        }
275    }
276
277    /// Converts content of a tag to a `Start` or an `Empty` event
278    ///
279    /// # Parameters
280    /// - `content`: Content of a tag between `<` and `>`
281    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
282        if let Some(content) = content.strip_suffix(b"/") {
283            // This is self-closed tag `<something/>`
284            let event = BytesStart::wrap(content, name_len(content));
285
286            if self.config.expand_empty_elements {
287                self.state = ParseState::InsideEmpty;
288                self.opened_starts.push(self.opened_buffer.len());
289                self.opened_buffer.extend(event.name().as_ref());
290                Event::Start(event)
291            } else {
292                Event::Empty(event)
293            }
294        } else {
295            let event = BytesStart::wrap(content, name_len(content));
296
297            // #514: Always store names event when .check_end_names == false,
298            // because checks can be temporary disabled and when they would be
299            // enabled, we should have that information
300            self.opened_starts.push(self.opened_buffer.len());
301            self.opened_buffer.extend(event.name().as_ref());
302            Event::Start(event)
303        }
304    }
305
306    #[inline]
307    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
308        self.state = ParseState::InsideText;
309        let name = self
310            .opened_buffer
311            .split_off(self.opened_starts.pop().unwrap());
312        BytesEnd::wrap(name.into())
313    }
314
315    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
316    ///
317    /// If [`encoding`] feature is enabled, the used encoding may change after
318    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
319    ///
320    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
321    /// defaults to UTF-8.
322    ///
323    /// [`encoding`]: ../../index.html#encoding
324    pub const fn decoder(&self) -> Decoder {
325        Decoder {
326            #[cfg(feature = "encoding")]
327            encoding: self.encoding.encoding(),
328        }
329    }
330}
331
332impl Default for ReaderState {
333    fn default() -> Self {
334        Self {
335            offset: 0,
336            last_error_offset: 0,
337            state: ParseState::Init,
338            config: Config::default(),
339            opened_buffer: Vec::new(),
340            opened_starts: Vec::new(),
341
342            #[cfg(feature = "encoding")]
343            encoding: EncodingRef::Implicit(UTF_8),
344        }
345    }
346}