quick_xml/reader/state.rs
1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, IllFormedError, Result, SyntaxError};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{BangType, Config, ParseState};
10use crate::utils::{is_whitespace, name_len};
11
/// Holds the current reader state together with the parser configuration.
///
/// It does not depend on how the data is read: the reader feeds data into it
/// and gets back the produced [`Event`]s.
#[derive(Clone, Debug)]
pub(super) struct ReaderState {
    /// Number of bytes read from the source of data since the reader was created
    pub offset: u64,
    /// A snapshot of the `offset` at which the last error was reported. It can
    /// be less than `offset`, because some errors are more conveniently reported
    /// at an earlier position, and `offset` itself cannot be moved back because
    /// `Error::IllFormed` errors are recoverable.
    pub last_error_offset: u64,
    /// Defines how to process the next byte
    pub state: ParseState,
    /// User-defined settings that affect parsing
    pub config: Config,
    /// Names of all currently opened (Start) elements that do not have a
    /// matching End element yet, stored back to back.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in [`Self::opened_starts`]
    /// (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Start indexes of the opened names inside [`Self::opened_buffer`]. See the
    /// documentation of that field for details
    opened_starts: Vec<usize>,

    #[cfg(feature = "encoding")]
    /// Reference to the encoding used to read an XML
    pub encoding: EncodingRef,
}
54
55impl ReaderState {
56 /// Trims end whitespaces from `bytes`, if required, and returns a text event.
57 ///
58 /// # Parameters
59 /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
60 pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
61 let mut content = bytes;
62
63 if self.config.trim_text_end {
64 // Skip the ending '<'
65 let len = bytes
66 .iter()
67 .rposition(|&b| !is_whitespace(b))
68 .map_or(0, |p| p + 1);
69 content = &bytes[..len];
70 }
71 BytesText::wrap(content, self.decoder())
72 }
73
74 /// Returns `Comment`, `CData` or `DocType` event.
75 ///
76 /// `buf` contains data between `<` and `>`:
77 /// - CDATA: `![CDATA[...]]`
78 /// - Comment: `!--...--`
79 /// - Doctype (uppercase): `!D...`
80 /// - Doctype (lowercase): `!d...`
81 pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
82 debug_assert_eq!(
83 buf.first(),
84 Some(&b'!'),
85 "CDATA, comment or DOCTYPE should start from '!'"
86 );
87
88 let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
89 string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
90 };
91
92 let len = buf.len();
93 match bang_type {
94 BangType::Comment if buf.starts_with(b"!--") => {
95 debug_assert!(buf.ends_with(b"--"));
96 if self.config.check_comments {
97 // search if '--' not in comments
98 let mut haystack = &buf[3..len - 2];
99 let mut off = 0;
100 while let Some(p) = memchr::memchr(b'-', haystack) {
101 off += p + 1;
102 // if next byte after `-` is also `-`, return an error
103 if buf[3 + off] == b'-' {
104 // Explanation of the magic:
105 //
106 // - `self.offset`` just after `>`,
107 // - `buf` contains `!-- con--tent --`
108 // - `p` is counted from byte after `<!--`
109 //
110 // <!-- con--tent -->:
111 // ~~~~~~~~~~~~~~~~ : - buf
112 // : =========== : - zone of search (possible values of `p`)
113 // : |---p : - p is counted from | (| is 0)
114 // : : : ^ - self.offset
115 // ^ : : - self.offset - len
116 // ^ : - self.offset - len + 2
117 // ^ - self.offset - len + 2 + p
118 self.last_error_offset = self.offset - len as u64 + 2 + p as u64;
119 return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
120 }
121 // Continue search after single `-` (+1 to skip it)
122 haystack = &haystack[p + 1..];
123 }
124 }
125 Ok(Event::Comment(BytesText::wrap(
126 // Cut of `!--` and `--` from start and end
127 &buf[3..len - 2],
128 self.decoder(),
129 )))
130 }
131 // XML requires uppercase only:
132 // https://www.w3.org/TR/xml11/#sec-cdata-sect
133 // Even HTML5 required uppercase only:
134 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
135 BangType::CData if buf.starts_with(b"![CDATA[") => {
136 debug_assert!(buf.ends_with(b"]]"));
137 Ok(Event::CData(BytesCData::wrap(
138 // Cut of `![CDATA[` and `]]` from start and end
139 &buf[8..len - 2],
140 self.decoder(),
141 )))
142 }
143 // XML requires uppercase only, but we will check that on validation stage:
144 // https://www.w3.org/TR/xml11/#sec-prolog-dtd
145 // HTML5 allows mixed case for doctype declarations:
146 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
147 BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => {
148 match buf[8..].iter().position(|&b| !is_whitespace(b)) {
149 Some(start) => Ok(Event::DocType(BytesText::wrap(
150 // Cut of `!DOCTYPE` and any number of spaces from start
151 &buf[8 + start..],
152 self.decoder(),
153 ))),
154 None => {
155 // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
156 // We want report error at place where name is expected - this is just
157 // before `>`
158 self.last_error_offset = self.offset - 1;
159 return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
160 }
161 }
162 }
163 _ => {
164 // <!....>
165 // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
166 // ^------- We report error at that position, so we need to subtract 2 and buf len
167 self.last_error_offset = self.offset - len as u64 - 2;
168 Err(bang_type.to_err().into())
169 }
170 }
171 }
172
173 /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
174 /// end name matches the last opened start name if `self.config.check_end_names` is set.
175 ///
176 /// `buf` contains data between `<` and `>`, for example `/tag`.
177 pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
178 debug_assert_eq!(
179 buf.first(),
180 Some(&b'/'),
181 "closing tag should start from '/'"
182 );
183
184 // Strip the `/` character. `content` contains data between `</` and `>`
185 let content = &buf[1..];
186 // XML standard permits whitespaces after the markup name in closing tags.
187 // Let's strip them from the buffer before comparing tag names.
188 let name = if self.config.trim_markup_names_in_closing_tags {
189 if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
190 &content[..pos_end_name + 1]
191 } else {
192 content
193 }
194 } else {
195 content
196 };
197
198 let decoder = self.decoder();
199
200 // Get the index in self.opened_buffer of the name of the last opened tag
201 match self.opened_starts.pop() {
202 Some(start) => {
203 if self.config.check_end_names {
204 let expected = &self.opened_buffer[start..];
205 if name != expected {
206 let expected = decoder.decode(expected).unwrap_or_default().into_owned();
207 // #513: In order to allow error recovery we should drop content of the buffer
208 self.opened_buffer.truncate(start);
209
210 // Report error at start of the end tag at `<` character
211 // -2 for `<` and `>`
212 self.last_error_offset = self.offset - buf.len() as u64 - 2;
213 return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
214 expected,
215 found: decoder.decode(name).unwrap_or_default().into_owned(),
216 }));
217 }
218 }
219
220 self.opened_buffer.truncate(start);
221 }
222 None => {
223 if !self.config.allow_unmatched_ends {
224 // Report error at start of the end tag at `<` character
225 // -2 for `<` and `>`
226 self.last_error_offset = self.offset - buf.len() as u64 - 2;
227 return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
228 decoder.decode(name).unwrap_or_default().into_owned(),
229 )));
230 }
231 }
232 }
233
234 Ok(Event::End(BytesEnd::wrap(name.into())))
235 }
236
237 /// `buf` contains data between `<` and `>` and the first byte is `?`.
238 /// `self.offset` already after the `>`
239 ///
240 /// Returns `Decl` or `PI` event
241 pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
242 debug_assert!(buf.len() > 0);
243 debug_assert_eq!(buf[0], b'?');
244
245 let len = buf.len();
246 // We accept at least <??>
247 // ~~ - len = 2
248 if len > 1 && buf[len - 1] == b'?' {
249 // Cut of `?` and `?` from start and end
250 let content = &buf[1..len - 1];
251 let len = content.len();
252
253 if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
254 let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
255
256 // Try getting encoding from the declaration event
257 #[cfg(feature = "encoding")]
258 if self.encoding.can_be_refined() {
259 if let Some(encoding) = event.encoder() {
260 self.encoding = EncodingRef::XmlDetected(encoding);
261 }
262 }
263
264 Ok(Event::Decl(event))
265 } else {
266 Ok(Event::PI(BytesPI::wrap(content, name_len(content))))
267 }
268 } else {
269 // <?....EOF
270 // ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
271 // so we move offset to it (-2 for `<` and `>`)
272 self.last_error_offset = self.offset - len as u64 - 2;
273 Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
274 }
275 }
276
277 /// Converts content of a tag to a `Start` or an `Empty` event
278 ///
279 /// # Parameters
280 /// - `content`: Content of a tag between `<` and `>`
281 pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
282 if let Some(content) = content.strip_suffix(b"/") {
283 // This is self-closed tag `<something/>`
284 let event = BytesStart::wrap(content, name_len(content));
285
286 if self.config.expand_empty_elements {
287 self.state = ParseState::InsideEmpty;
288 self.opened_starts.push(self.opened_buffer.len());
289 self.opened_buffer.extend(event.name().as_ref());
290 Event::Start(event)
291 } else {
292 Event::Empty(event)
293 }
294 } else {
295 let event = BytesStart::wrap(content, name_len(content));
296
297 // #514: Always store names event when .check_end_names == false,
298 // because checks can be temporary disabled and when they would be
299 // enabled, we should have that information
300 self.opened_starts.push(self.opened_buffer.len());
301 self.opened_buffer.extend(event.name().as_ref());
302 Event::Start(event)
303 }
304 }
305
306 #[inline]
307 pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
308 self.state = ParseState::InsideText;
309 let name = self
310 .opened_buffer
311 .split_off(self.opened_starts.pop().unwrap());
312 BytesEnd::wrap(name.into())
313 }
314
    /// Returns the decoder used to convert bytes read by this reader into strings.
    ///
    /// If the [`encoding`] feature is enabled, the encoding in use may change
    /// after the XML declaration is parsed; otherwise the encoding is fixed to
    /// UTF-8.
    ///
    /// If the [`encoding`] feature is enabled and no encoding is specified in
    /// the declaration, UTF-8 is used.
    ///
    /// [`encoding`]: ../../index.html#encoding
    pub const fn decoder(&self) -> Decoder {
        Decoder {
            // The field exists only when the `encoding` feature is enabled
            #[cfg(feature = "encoding")]
            encoding: self.encoding.encoding(),
        }
    }
330}
331
332impl Default for ReaderState {
333 fn default() -> Self {
334 Self {
335 offset: 0,
336 last_error_offset: 0,
337 state: ParseState::Init,
338 config: Config::default(),
339 opened_buffer: Vec::new(),
340 opened_starts: Vec::new(),
341
342 #[cfg(feature = "encoding")]
343 encoding: EncodingRef::Implicit(UTF_8),
344 }
345 }
346}