use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::ops::Range;

use crate::{
    AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
    NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
    XMLNS,
};

use crate::tokenizer::{self, Reference, StrSpan, Stream};

type Result<T> = core::result::Result<T, Error>;

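/// A list of all possible errors that can occur during parsing.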
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    InvalidXmlPrefixUri(TextPos),

    UnexpectedXmlUri(TextPos),

    UnexpectedXmlnsUri(TextPos),

    InvalidElementNamePrefix(TextPos),

    DuplicatedNamespace(String, TextPos),

    UnknownNamespace(String, TextPos),

    #[allow(missing_docs)]
    UnexpectedCloseTag(String, String, TextPos),

    UnexpectedEntityCloseTag(TextPos),

    UnknownEntityReference(String, TextPos),

    MalformedEntityReference(TextPos),

    EntityReferenceLoop(TextPos),

    InvalidAttributeValue(TextPos),

    DuplicatedAttribute(String, TextPos),

    NoRootNode,

    UnclosedRootNode,

    UnexpectedDeclaration(TextPos),

    DtdDetected,

    NodesLimitReached,

    AttributesLimitReached,

    NamespacesLimitReached,

    InvalidName(TextPos),

    NonXmlChar(char, TextPos),

    InvalidChar(u8, u8, TextPos),

    InvalidChar2(&'static str, u8, TextPos),

    InvalidString(&'static str, TextPos),

    InvalidExternalID(TextPos),

    InvalidComment(TextPos),

    InvalidCharacterData(TextPos),

    UnknownToken(TextPos),

    UnexpectedEndOfStream,
}

impl Error {
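    /// Returns the position (line and column) in the original document where the error occurred.
    ///
    /// Errors that are not tied to a specific location, such as `NoRootNode`, report `1:1`.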
    pub fn pos(&self) -> TextPos {
        match *self {
            Error::InvalidXmlPrefixUri(pos) => pos,
            Error::UnexpectedXmlUri(pos) => pos,
            Error::UnexpectedXmlnsUri(pos) => pos,
            Error::InvalidElementNamePrefix(pos) => pos,
            Error::DuplicatedNamespace(_, pos) => pos,
            Error::UnknownNamespace(_, pos) => pos,
            Error::UnexpectedCloseTag(_, _, pos) => pos,
            Error::UnexpectedEntityCloseTag(pos) => pos,
            Error::UnknownEntityReference(_, pos) => pos,
            Error::MalformedEntityReference(pos) => pos,
            Error::EntityReferenceLoop(pos) => pos,
            Error::InvalidAttributeValue(pos) => pos,
            Error::DuplicatedAttribute(_, pos) => pos,
            Error::NoRootNode => TextPos::new(1, 1),
            Error::UnclosedRootNode => TextPos::new(1, 1),
            Error::UnexpectedDeclaration(pos) => pos,
            Error::DtdDetected => TextPos::new(1, 1),
            Error::NodesLimitReached => TextPos::new(1, 1),
            Error::AttributesLimitReached => TextPos::new(1, 1),
            Error::NamespacesLimitReached => TextPos::new(1, 1),
            Error::InvalidName(pos) => pos,
            Error::NonXmlChar(_, pos) => pos,
            Error::InvalidChar(_, _, pos) => pos,
            Error::InvalidChar2(_, _, pos) => pos,
            Error::InvalidString(_, pos) => pos,
            Error::InvalidExternalID(pos) => pos,
            Error::InvalidComment(pos) => pos,
            Error::InvalidCharacterData(pos) => pos,
            Error::UnknownToken(pos) => pos,
            Error::UnexpectedEndOfStream => TextPos::new(1, 1),
        }
    }
}

impl core::fmt::Display for Error {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        match *self {
            Error::InvalidXmlPrefixUri(pos) => {
                write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
            }
            Error::UnexpectedXmlUri(pos) => {
                write!(
                    f,
                    "the 'xml' namespace URI is used for a prefix other than 'xml' at {}",
                    pos
                )
            }
            Error::UnexpectedXmlnsUri(pos) => {
                write!(
                    f,
                    "the 'xmlns' URI is used at {}, but it must not be declared",
                    pos
                )
            }
            Error::InvalidElementNamePrefix(pos) => {
                write!(
                    f,
                    "the 'xmlns' prefix is used at {}, but it must not be",
                    pos
                )
            }
            Error::DuplicatedNamespace(ref name, pos) => {
                write!(f, "namespace '{}' at {} is already defined", name, pos)
            }
            Error::UnknownNamespace(ref name, pos) => {
                write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
            }
            Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
                write!(
                    f,
                    "expected '{}' tag, not '{}' at {}",
                    expected, actual, pos
                )
            }
            Error::UnexpectedEntityCloseTag(pos) => {
                write!(f, "unexpected close tag at {}", pos)
            }
            Error::MalformedEntityReference(pos) => {
                write!(f, "malformed entity reference at {}", pos)
            }
            Error::UnknownEntityReference(ref name, pos) => {
                write!(f, "unknown entity reference '{}' at {}", name, pos)
            }
            Error::EntityReferenceLoop(pos) => {
                write!(f, "a possible entity reference loop was detected at {}", pos)
            }
            Error::InvalidAttributeValue(pos) => {
                write!(f, "unescaped '<' found at {}", pos)
            }
            Error::DuplicatedAttribute(ref name, pos) => {
                write!(f, "attribute '{}' at {} is already defined", name, pos)
            }
            Error::NoRootNode => {
                write!(f, "the document does not have a root node")
            }
            Error::UnclosedRootNode => {
                write!(f, "the root node was opened but never closed")
            }
            Error::UnexpectedDeclaration(pos) => {
                write!(f, "unexpected XML declaration at {}", pos)
            }
            Error::DtdDetected => {
                write!(f, "XML with DTD detected")
            }
            Error::NodesLimitReached => {
                write!(f, "nodes limit reached")
            }
            Error::AttributesLimitReached => {
                write!(f, "more than 2^32 attributes were parsed")
            }
            Error::NamespacesLimitReached => {
                write!(f, "more than 2^16 unique namespaces were parsed")
            }
            Error::InvalidName(pos) => {
                write!(f, "invalid name token at {}", pos)
            }
            Error::NonXmlChar(c, pos) => {
                write!(f, "a non-XML character {:?} found at {}", c, pos)
            }
            Error::InvalidChar(expected, actual, pos) => {
                write!(
                    f,
                    "expected '{}' not '{}' at {}",
                    expected as char, actual as char, pos
                )
            }
            Error::InvalidChar2(expected, actual, pos) => {
                write!(
                    f,
                    "expected {} not '{}' at {}",
                    expected, actual as char, pos
                )
            }
            Error::InvalidString(expected, pos) => {
                write!(f, "expected '{}' at {}", expected, pos)
            }
            Error::InvalidExternalID(pos) => {
                write!(f, "invalid ExternalID at {}", pos)
            }
            Error::InvalidComment(pos) => {
                write!(f, "comment at {} contains '--'", pos)
            }
            Error::InvalidCharacterData(pos) => {
                write!(f, "']]>' at {} is not allowed inside character data", pos)
            }
            Error::UnknownToken(pos) => {
                write!(f, "unknown token at {}", pos)
            }
            Error::UnexpectedEndOfStream => {
                write!(f, "unexpected end of stream")
            }
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Error {
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}

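/// Options that control document parsing.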
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ParsingOptions {
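    /// Allow DTDs in the input document.
    ///
    /// Defaults to `false`; when disallowed, documents containing a DTD are rejected
    /// (see `Error::DtdDetected`).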
    pub allow_dtd: bool,

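    /// Maximum number of nodes that may be created before parsing fails with
    /// `Error::NodesLimitReached`. Defaults to `u32::MAX`.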
    pub nodes_limit: u32,
}

#[allow(clippy::derivable_impls)]
impl Default for ParsingOptions {
    fn default() -> Self {
        ParsingOptions {
            allow_dtd: false,
            nodes_limit: core::u32::MAX,
        }
    }
}

struct TempAttributeData<'input> {
    prefix: &'input str,
    local: &'input str,
    value: StringStorage<'input>,
    range: Range<usize>,
    #[allow(unused)]
    qname_len: u16,
    #[allow(unused)]
    eq_len: u8,
}

impl<'input> Document<'input> {
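    /// Parses the input XML string using the default `ParsingOptions`.
    ///
    /// A minimal usage sketch, based only on the API used in this module (shown as text,
    /// not a doctest):
    ///
    /// ```text
    /// let doc = Document::parse("<root><child/></root>").unwrap();
    /// assert!(doc.root().children().any(|n| n.is_element()));
    /// ```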
    #[inline]
    pub fn parse(text: &str) -> Result<Document> {
        Self::parse_with_options(text, ParsingOptions::default())
    }

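    /// Parses the input XML string using the supplied `ParsingOptions`.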
    #[inline]
    pub fn parse_with_options(text: &str, opt: ParsingOptions) -> Result<Document> {
        parse(text, opt)
    }
}

struct Entity<'input> {
    name: &'input str,
    value: StrSpan<'input>,
}

#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    prefix: &'input str,
    name: &'input str,
    pos: usize,
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    #[inline]
    fn new_null() -> Self {
        Self {
            prefix: "",
            name: "",
            pos: 0,
            prefix_pos: 0,
        }
    }
}

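// Guards against billion-laughs style entity expansion: expanding entities more than
// 10 levels deep, or following more than 255 entity references within a single expansion,
// is reported as `Error::EntityReferenceLoop`.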
#[derive(Default)]
struct LoopDetector {
    depth: u8,
    references: u8,
}

impl LoopDetector {
    #[inline]
    fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
        if self.depth < 10 {
            self.depth += 1;
            Ok(())
        } else {
            Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
        }
    }

    #[inline]
    fn dec_depth(&mut self) {
        if self.depth > 0 {
            self.depth -= 1;
        }

        if self.depth == 0 {
            self.references = 0;
        }
    }

    #[inline]
    fn inc_references(&mut self, stream: &Stream) -> Result<()> {
        if self.depth == 0 {
            Ok(())
        } else {
            if self.references == core::u8::MAX {
                return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
            }

            self.references += 1;
            Ok(())
        }
    }
}

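// Mutable parsing state shared by the tokenizer callbacks while the `Document` is being built.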
struct Context<'input> {
    opt: ParsingOptions,
    namespace_start_idx: usize,
    current_attributes: Vec<TempAttributeData<'input>>,
    awaiting_subtree: Vec<NodeId>,
    parent_prefixes: Vec<&'input str>,
    entities: Vec<Entity<'input>>,
    after_text: bool,
    parent_id: NodeId,
    tag_name: TagNameSpan<'input>,
    loop_detector: LoopDetector,
    doc: Document<'input>,
}

impl<'input> Context<'input> {
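    // Appends a node as the last child of the current parent, wiring up the
    // `prev_sibling`/`last_child` links and resolving pending `next_subtree` references.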
    fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
        if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
            return Err(Error::NodesLimitReached);
        }

        #[cfg(not(feature = "positions"))]
        let _ = range;

        let new_child_id = NodeId::from(self.doc.nodes.len());

        let appending_element = matches!(kind, NodeKind::Element { .. });
        self.doc.nodes.push(NodeData {
            parent: Some(self.parent_id),
            prev_sibling: None,
            next_subtree: None,
            last_child: None,
            kind,
            #[cfg(feature = "positions")]
            range,
        });

        let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
        self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
        self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);

        for id in &self.awaiting_subtree {
            self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
        }
        self.awaiting_subtree.clear();

        if !appending_element {
            self.awaiting_subtree
                .push(NodeId::from(self.doc.nodes.len() - 1));
        }

        Ok(new_child_id)
    }

    fn err_pos_at(&self, pos: usize) -> TextPos {
        self.doc.text_pos_at(pos)
    }
}

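// The number of '<' and '=' bytes in the input gives a cheap upper-bound estimate of the node
// and attribute counts; it is used only to pre-allocate storage.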
fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
    let nodes_capacity = text.bytes().filter(|c| *c == b'<').count();
    let attributes_capacity = text.bytes().filter(|c| *c == b'=').count();

    let mut doc = Document {
        text,
        nodes: Vec::with_capacity(nodes_capacity),
        attributes: Vec::with_capacity(attributes_capacity),
        namespaces: Namespaces::default(),
    };

    doc.nodes.push(NodeData {
        parent: None,
        prev_sibling: None,
        next_subtree: None,
        last_child: None,
        kind: NodeKind::Root,
        #[cfg(feature = "positions")]
        range: 0..text.len(),
    });

    doc.namespaces
        .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;

    let mut ctx = Context {
        opt,
        namespace_start_idx: 1,
        current_attributes: Vec::with_capacity(16),
        entities: Vec::new(),
        awaiting_subtree: Vec::new(),
        parent_prefixes: Vec::new(),
        after_text: false,
        parent_id: NodeId::new(0),
        tag_name: TagNameSpan::new_null(),
        loop_detector: LoopDetector::default(),
        doc,
    };
    ctx.parent_prefixes.push("");

    tokenizer::parse(text, opt.allow_dtd, &mut ctx)?;

    let mut doc = ctx.doc;
    if !doc.root().children().any(|n| n.is_element()) {
        return Err(Error::NoRootNode);
    }

    if ctx.parent_prefixes.len() > 1 {
        return Err(Error::UnclosedRootNode);
    }

    doc.nodes.shrink_to_fit();
    doc.attributes.shrink_to_fit();
    doc.namespaces.shrink_to_fit();

    Ok(doc)
}

impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
    fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
        match token {
            tokenizer::Token::ProcessingInstruction(target, value, range) => {
                let pi = NodeKind::PI(PI { target, value });
                self.append_node(pi, range)?;
                self.after_text = false;
            }
            tokenizer::Token::Comment(text, range) => {
                self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
                self.after_text = false;
            }
            tokenizer::Token::EntityDeclaration(name, definition) => {
                self.entities.push(Entity {
                    name,
                    value: definition,
                });
            }
            tokenizer::Token::ElementStart(prefix, local, start) => {
                if prefix == XMLNS {
                    let pos = self.err_pos_at(start + 1);
                    return Err(Error::InvalidElementNamePrefix(pos));
                }

                self.tag_name = TagNameSpan {
                    prefix,
                    name: local,
                    pos: start,
                    prefix_pos: start + 1,
                };

                self.after_text = false;
            }
            tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
                process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
            }
            tokenizer::Token::ElementEnd(end, range) => {
                process_element(end, range, self)?;
                self.after_text = false;
            }
            tokenizer::Token::Text(text, range) => {
                process_text(text, range, self)?;
            }
            tokenizer::Token::Cdata(text, range) => {
                process_cdata(text, range, self)?;
            }
        }

        Ok(())
    }
}

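// Handles a single attribute token: namespace declarations (`xmlns` / `xmlns:prefix`) are
// validated and pushed to the namespaces list, while regular attributes are buffered in
// `current_attributes` until the opening tag ends.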
#[allow(clippy::too_many_arguments)]
fn process_attribute<'input>(
    range: Range<usize>,
    qname_len: u16,
    eq_len: u8,
    prefix: &'input str,
    local: &'input str,
    value: StrSpan<'input>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    let value = normalize_attribute(value, ctx)?;

    if prefix == XMLNS {
        if value.as_str() == NS_XMLNS_URI {
            let pos = ctx.err_pos_at(range.start);
            return Err(Error::UnexpectedXmlnsUri(pos));
        }

        let is_xml_ns_uri = value.as_str() == NS_XML_URI;

        if local == NS_XML_PREFIX {
            if !is_xml_ns_uri {
                let pos = ctx.err_pos_at(range.start);
                return Err(Error::InvalidXmlPrefixUri(pos));
            }
        } else {
            if is_xml_ns_uri {
                let pos = ctx.err_pos_at(range.start);
                return Err(Error::UnexpectedXmlUri(pos));
            }
        }

        if ctx
            .doc
            .namespaces
            .exists(ctx.namespace_start_idx, Some(local))
        {
            let pos = ctx.err_pos_at(range.start);
            return Err(Error::DuplicatedNamespace(local.to_string(), pos));
        }

        if !is_xml_ns_uri {
            ctx.doc.namespaces.push_ns(Some(local), value)?;
        }
    } else if local == XMLNS {
        if value.as_str() == NS_XML_URI {
            let pos = ctx.err_pos_at(range.start);
            return Err(Error::UnexpectedXmlUri(pos));
        }

        if value.as_str() == NS_XMLNS_URI {
            let pos = ctx.err_pos_at(range.start);
            return Err(Error::UnexpectedXmlnsUri(pos));
        }

        ctx.doc.namespaces.push_ns(None, value)?;
    } else {
        ctx.current_attributes.push(TempAttributeData {
            prefix,
            local,
            value,
            range,
            qname_len,
            eq_len,
        });
    }

    Ok(())
}

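// Handles an element-end token (`/>`, `</name>`, or `>`): resolves the collected namespaces
// and buffered attributes and appends or closes the element node.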
fn process_element<'input>(
    end_token: tokenizer::ElementEnd<'input>,
    token_range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    if ctx.tag_name.name.is_empty() {
        if let tokenizer::ElementEnd::Close(..) = end_token {
            return Err(Error::UnexpectedEntityCloseTag(
                ctx.err_pos_at(token_range.start),
            ));
        } else {
            unreachable!("should be already checked by the tokenizer");
        }
    }

    let namespaces = ctx.resolve_namespaces();
    ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();

    let attributes = resolve_attributes(namespaces, ctx)?;

    match end_token {
        tokenizer::ElementEnd::Empty => {
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                ctx,
            )?;
            let new_element_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            ctx.awaiting_subtree.push(new_element_id);
        }
        tokenizer::ElementEnd::Close(prefix, local) => {
            let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
            let parent_prefix = *ctx.parent_prefixes.last().unwrap();

            #[cfg(feature = "positions")]
            {
                parent_node.range.end = token_range.end;
            }

            if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
                if prefix != parent_prefix || local != tag_name.local_name {
                    return Err(Error::UnexpectedCloseTag(
                        gen_qname_string(parent_prefix, tag_name.local_name),
                        gen_qname_string(prefix, local),
                        ctx.err_pos_at(token_range.start),
                    ));
                }
            }
            ctx.awaiting_subtree.push(ctx.parent_id);

            if let Some(id) = parent_node.parent {
                ctx.parent_id = id;
                ctx.parent_prefixes.pop();
                debug_assert!(!ctx.parent_prefixes.is_empty());
            } else {
                unreachable!("should be already checked by the tokenizer");
            }
        }
        tokenizer::ElementEnd::Open => {
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                ctx,
            )?;
            ctx.parent_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            ctx.parent_prefixes.push(ctx.tag_name.prefix);
        }
    }

    Ok(())
}

impl Context<'_> {
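    // Combines the namespaces declared on the current element with the parent's namespaces,
    // skipping parent entries that are shadowed by a redeclaration on this element.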
    fn resolve_namespaces(&mut self) -> ShortRange {
        if let NodeKind::Element { ref namespaces, .. } =
            self.doc.nodes[self.parent_id.get_usize()].kind
        {
            let parent_ns = *namespaces;
            if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
                return parent_ns;
            }

            for i in parent_ns.to_urange() {
                if !self.doc.namespaces.exists(
                    self.namespace_start_idx,
                    self.doc
                        .namespaces
                        .get(self.doc.namespaces.tree_order[i])
                        .name,
                ) {
                    self.doc.namespaces.push_ref(i);
                }
            }
        }

        (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
    }
}

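// Moves the buffered attributes into the document, resolving their namespaces and rejecting
// duplicates that share the same expanded name.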
fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
    if ctx.current_attributes.is_empty() {
        return Ok(ShortRange::new(0, 0));
    }

    if ctx.doc.attributes.len() + ctx.current_attributes.len() >= core::u32::MAX as usize {
        return Err(Error::AttributesLimitReached);
    }

    let start_idx = ctx.doc.attributes.len();

    let current_attributes = core::mem::take(&mut ctx.current_attributes);
    for attr in current_attributes {
        let namespace_idx = if attr.prefix == NS_XML_PREFIX {
            Some(NamespaceIdx(0))
        } else if attr.prefix.is_empty() {
            None
        } else {
            get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, ctx)?
        };

        let attr_name = ExpandedNameIndexed {
            namespace_idx,
            local_name: attr.local,
        };

        if ctx.doc.attributes[start_idx..].iter().any(|attr| {
            attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
        }) {
            let pos = ctx.err_pos_at(attr.range.start);
            return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
        }

        ctx.doc.attributes.push(AttributeData {
            name: attr_name,
            value: attr.value,
            #[cfg(feature = "positions")]
            range: attr.range,
            #[cfg(feature = "positions")]
            qname_len: attr.qname_len,
            #[cfg(feature = "positions")]
            eq_len: attr.eq_len,
        });
    }

    Ok((start_idx..ctx.doc.attributes.len()).into())
}

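// Appends a text node. The fast path borrows the input directly; text containing entity
// references or '\r' is expanded and newline-normalized into an owned buffer instead.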
fn process_text<'input>(
    text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    if !text.bytes().any(|b| b == b'&' || b == b'\r') {
        append_text(StringStorage::Borrowed(text), range, ctx)?;
        ctx.after_text = true;
        return Ok(());
    }

    let mut text_buffer = TextBuffer::new();
    let mut is_as_is = false;
    let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
    while !stream.at_end() {
        match parse_next_chunk(&mut stream, &ctx.entities)? {
            NextChunk::Byte(c) => {
                if is_as_is {
                    text_buffer.push_raw(c);
                    is_as_is = false;
                } else {
                    text_buffer.push_from_text(c, stream.at_end());
                }
            }
            NextChunk::Char(c) => {
                for b in CharToBytes::new(c) {
                    if ctx.loop_detector.depth > 0 {
                        text_buffer.push_from_text(b, stream.at_end());
                    } else {
                        text_buffer.push_raw(b);
                        is_as_is = true;
                    }
                }
            }
            NextChunk::Text(fragment) => {
                is_as_is = false;

                if !text_buffer.is_empty() {
                    let storage = StringStorage::new_owned(text_buffer.to_str());
                    append_text(storage, range.clone(), ctx)?;
                    text_buffer.clear();
                    ctx.after_text = true;
                }

                ctx.loop_detector.inc_references(&stream)?;
                ctx.loop_detector.inc_depth(&stream)?;

                let mut stream = Stream::from_substr(ctx.doc.text, fragment.range());
                let prev_tag_name = ctx.tag_name;
                ctx.tag_name = TagNameSpan::new_null();
                tokenizer::parse_content(&mut stream, ctx)?;
                ctx.tag_name = prev_tag_name;
                text_buffer.clear();

                ctx.loop_detector.dec_depth();
            }
        }
    }

    if !text_buffer.is_empty() {
        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
        ctx.after_text = true;
    }

    Ok(())
}

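// CDATA needs no entity expansion, only newline normalization ("\r\n" and "\r" become "\n").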
fn process_cdata<'input>(
    text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    if !text.as_bytes().contains(&b'\r') {
        append_text(StringStorage::Borrowed(text), range, ctx)?;
        ctx.after_text = true;
        return Ok(());
    }

    let mut text_buffer = TextBuffer::new();
    let count = text.chars().count();
    for (i, c) in text.chars().enumerate() {
        for b in CharToBytes::new(c) {
            text_buffer.push_from_text(b, i + 1 == count);
        }
    }

    if !text_buffer.is_empty() {
        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
        ctx.after_text = true;
    }

    Ok(())
}

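// Appends a text node, or merges the text into the previous node when the last appended node
// was already a text node (e.g. after an expanded entity reference).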
fn append_text<'input>(
    text: StringStorage<'input>,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    if ctx.after_text {
        if let Some(node) = ctx.doc.nodes.last_mut() {
            if let NodeKind::Text(ref mut prev_text) = node.kind {
                let text_str = text.as_str();
                let prev_text_str = prev_text.as_str();

                let mut concat_text = String::with_capacity(text_str.len() + prev_text_str.len());
                concat_text.push_str(prev_text_str);
                concat_text.push_str(text_str);
                *prev_text = StringStorage::new_owned(concat_text);
            }
        }
    } else {
        ctx.append_node(NodeKind::Text(text), range)?;
    }

    Ok(())
}

enum NextChunk<'a> {
    Byte(u8),
    Char(char),
    Text(StrSpan<'a>),
}

fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
    debug_assert!(!stream.at_end());

    let c = stream.curr_byte_unchecked();

    if c == b'&' {
        let start = stream.pos();
        match stream.try_consume_reference() {
            Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
            Some(Reference::Entity(name)) => entities
                .iter()
                .find(|e| e.name == name)
                .map(|e| NextChunk::Text(e.value))
                .ok_or_else(|| {
                    let pos = stream.gen_text_pos_from(start);
                    Error::UnknownEntityReference(name.into(), pos)
                }),
            None => {
                let pos = stream.gen_text_pos_from(start);
                Err(Error::MalformedEntityReference(pos))
            }
        }
    } else {
        stream.advance(1);
        Ok(NextChunk::Byte(c))
    }
}

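// Performs attribute-value normalization: entity references are expanded and whitespace
// characters are replaced with spaces. Values that need no work are borrowed as-is.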
fn normalize_attribute<'input>(
    text: StrSpan<'input>,
    ctx: &mut Context<'input>,
) -> Result<StringStorage<'input>> {
    if is_normalization_required(&text) {
        let mut text_buffer = TextBuffer::new();
        _normalize_attribute(text, &mut text_buffer, ctx)?;
        Ok(StringStorage::new_owned(text_buffer.finish()))
    } else {
        Ok(StringStorage::Borrowed(text.as_str()))
    }
}

#[inline]
fn is_normalization_required(text: &StrSpan) -> bool {
    fn check(c: u8) -> bool {
        matches!(c, b'&' | b'\t' | b'\n' | b'\r')
    }

    text.as_str().bytes().any(check)
}

fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
    let mut stream = Stream::from_substr(ctx.doc.text, text.range());
    while !stream.at_end() {
        let c = stream.curr_byte_unchecked();

        if c != b'&' {
            stream.advance(1);
            buffer.push_from_attr(c, stream.curr_byte().ok());
            continue;
        }

        let start = stream.pos();
        match stream.try_consume_reference() {
            Some(Reference::Char(ch)) => {
                for b in CharToBytes::new(ch) {
                    if ctx.loop_detector.depth > 0 {
                        if b == b'<' {
                            return Err(Error::InvalidAttributeValue(
                                stream.gen_text_pos_from(start),
                            ));
                        }

                        buffer.push_from_attr(b, None);
                    } else {
                        buffer.push_raw(b);
                    }
                }
            }
            Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
                Some(entity) => {
                    ctx.loop_detector.inc_references(&stream)?;
                    ctx.loop_detector.inc_depth(&stream)?;
                    _normalize_attribute(entity.value, buffer, ctx)?;
                    ctx.loop_detector.dec_depth();
                }
                None => {
                    let pos = stream.gen_text_pos_from(start);
                    return Err(Error::UnknownEntityReference(name.into(), pos));
                }
            },
            None => {
                let pos = stream.gen_text_pos_from(start);
                return Err(Error::MalformedEntityReference(pos));
            }
        }
    }

    Ok(())
}

fn get_ns_idx_by_prefix(
    namespaces: ShortRange,
    prefix_pos: usize,
    prefix: &str,
    ctx: &Context,
) -> Result<Option<NamespaceIdx>> {
    let prefix_opt = if prefix.is_empty() { None } else { Some(prefix) };

    let idx = ctx.doc.namespaces.tree_order[namespaces.to_urange()]
        .iter()
        .find(|idx| ctx.doc.namespaces.get(**idx).name == prefix_opt);

    match idx {
        Some(idx) => Ok(Some(*idx)),
        None => {
            if !prefix.is_empty() {
                let pos = ctx.err_pos_at(prefix_pos);
                Err(Error::UnknownNamespace(prefix.to_string(), pos))
            } else {
                Ok(None)
            }
        }
    }
}

fn gen_qname_string(prefix: &str, local: &str) -> String {
    if prefix.is_empty() {
        local.to_string()
    } else {
        alloc::format!("{}:{}", prefix, local)
    }
}

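// Iterator over the UTF-8 bytes of a single `char`; unused buffer slots are filled with 0xFF
// as an end marker.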
struct CharToBytes {
    buf: [u8; 4],
    idx: u8,
}

impl CharToBytes {
    #[inline]
    fn new(c: char) -> Self {
        let mut buf = [0xFF; 4];
        c.encode_utf8(&mut buf);

        CharToBytes { buf, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.idx < 4 {
            let b = self.buf[self.idx as usize];

            if b != 0xFF {
                self.idx += 1;
                return Some(b);
            } else {
                self.idx = 4;
            }
        }

        None
    }
}

struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    #[inline]
    fn new() -> Self {
        TextBuffer {
            buffer: Vec::with_capacity(32),
        }
    }

    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

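    // Attribute-value normalization: skip the '\r' of a "\r\n" pair and replace the remaining
    // whitespace characters ('\n', '\r', '\t') with a space.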
    fn push_from_attr(&mut self, mut current: u8, next: Option<u8>) {
        if current == b'\r' && next == Some(b'\n') {
            return;
        }

        current = match current {
            b'\n' | b'\r' | b'\t' => b' ',
            _ => current,
        };

        self.buffer.push(current);
    }

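    // Text newline normalization: "\r\n" and a standalone '\r' both become '\n'.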
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        if self.buffer.last() == Some(&b'\r') {
            let idx = self.buffer.len() - 1;
            self.buffer[idx] = b'\n';

            if at_end && c == b'\r' {
                self.buffer.push(b'\n');
            } else if c != b'\n' {
                self.buffer.push(c);
            }
        } else if at_end && c == b'\r' {
            self.buffer.push(b'\n');
        } else {
            self.buffer.push(c);
        }
    }

    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    #[inline]
    fn to_str(&self) -> &str {
        core::str::from_utf8(&self.buffer).unwrap()
    }

    #[inline]
    fn finish(self) -> String {
        String::from_utf8(self.buffer).unwrap()
    }
}