// icu_normalizer/lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6#![cfg_attr(not(any(test, doc)), no_std)]
7#![cfg_attr(
8    not(test),
9    deny(
10        clippy::indexing_slicing,
11        clippy::unwrap_used,
12        clippy::expect_used,
13        clippy::panic,
14        clippy::exhaustive_structs,
15        clippy::exhaustive_enums,
16        clippy::trivially_copy_pass_by_ref,
17        missing_debug_implementations,
18    )
19)]
20#![warn(missing_docs)]
21
22//! Normalizing text into Unicode Normalization Forms.
23//!
24//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
25//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
26//!
27//! # Functionality
28//!
29//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
30//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
31//!
32//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
33//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
34//!
35//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
36//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
37//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
38//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
39//!
40//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
41//! the canonical compositon operation given two `char`s. It also provides access to the Canonical Combining Class
42//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
43//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
44//!
45//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
46//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
47//! non-“maybe” answer.
48//!
49//! # Examples
50//!
51//! ```
52//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
53//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
54//! assert!(nfc.is_normalized("ä"));
55//!
56//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
57//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
58//! assert!(!nfd.is_normalized("ä"));
59//! ```
60
61extern crate alloc;
62
// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values.
//
// This macro expands to a `const` block yielding
// `CanonicalCombiningClass::from_icu4c_value($num)`. When the
// `icu_properties` feature is enabled, the block additionally verifies at
// compile time that `$num` matches the value icu_properties assigns to the
// variant `$name`, failing the build otherwise.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            // Compile-time cross-check against icu_properties (feature-gated).
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
76
77pub mod properties;
78pub mod provider;
79pub mod uts46;
80
81use crate::provider::CanonicalCompositions;
82use crate::provider::DecompositionData;
83use crate::provider::NormalizerNfdDataV1;
84use crate::provider::NormalizerNfkdDataV1;
85use crate::provider::NormalizerUts46DataV1;
86use alloc::borrow::Cow;
87use alloc::string::String;
88use core::char::REPLACEMENT_CHARACTER;
89use icu_collections::char16trie::Char16Trie;
90use icu_collections::char16trie::Char16TrieIterator;
91use icu_collections::char16trie::TrieResult;
92use icu_collections::codepointtrie::CodePointTrie;
93#[cfg(feature = "icu_properties")]
94use icu_properties::props::CanonicalCombiningClass;
95use icu_provider::prelude::*;
96use provider::DecompositionTables;
97use provider::NormalizerNfcV1;
98use provider::NormalizerNfdTablesV1;
99use provider::NormalizerNfkdTablesV1;
100use smallvec::SmallVec;
101#[cfg(feature = "utf16_iter")]
102use utf16_iter::Utf16CharsEx;
103#[cfg(feature = "utf8_iter")]
104use utf8_iter::Utf8CharsEx;
105use zerovec::{zeroslice, ZeroSlice};
106
/// This type exists as a shim for icu_properties CanonicalCombiningClass when the crate is disabled
/// It should not be exposed to users.
#[cfg(not(feature = "icu_properties"))]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
struct CanonicalCombiningClass(pub(crate) u8);

#[cfg(not(feature = "icu_properties"))]
impl CanonicalCombiningClass {
    /// Wraps a raw ICU4C-style CCC value (mirrors the icu_properties API).
    const fn from_icu4c_value(v: u8) -> Self {
        Self(v)
    }
    /// Returns the raw ICU4C-style CCC value (mirrors the icu_properties API).
    const fn to_icu4c_value(self) -> u8 {
        self.0
    }
}
122
/// CCC 0 (Not_Reordered); cross-checked against icu_properties when that feature is enabled.
const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
/// CCC 230 (Above); cross-checked against icu_properties when that feature is enabled.
const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);
125
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
/// See `IGNORABLE_MARKER`.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
137
/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or a value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
166
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    // Mask away the two high flag bits and the low bits that hold the
    // class itself; the remaining carrier pattern 0xD800 marks a
    // CCC-carrying value.
    let carrier = trie_value & 0x3FFFFE00;
    carrier == 0xD800
}
174
/// Checks if the trie signifies a special non-starter decomposition.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    // Mask away the two high flag bits and the low 8 bits; the remaining
    // carrier pattern 0xD900 marks a special non-starter decomposition.
    let carrier = trie_value & 0x3FFFFF00;
    carrier == 0xD900
}
181
/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// See trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    // A decomposition starts with a non-starter exactly when the trie
    // value carries a canonical combining class; this is the same bit
    // test as `trie_value_has_ccc`.
    (trie_value & 0x3FFFFE00) == 0xD800
}
189
190/// Extracts a canonical combining class (possibly zero) from a trie value.
191///
192/// See trie-value-format.md
193fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
194    if trie_value_has_ccc(trie_value) {
195        CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
196    } else {
197        CCC_NOT_REORDERED
198    }
199}
200
/// The tail (everything after the first character, which is U+0635)
/// of the NFKD form of U+FDFA as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
207
/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;

// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
236
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    match opt {
        Some(val) => val,
        None => {
            // GIGO case: bogus provider data. Fail loudly in debug
            // builds, degrade gracefully in release builds.
            debug_assert!(false);
            default
        }
    }
}
252
/// Convert a `u32` _obtained from data provider data_ to `char`.
///
/// Invalid scalar values are replaced with U+FFFD (tripping a debug
/// assertion), since they can only arise from bogus provider data.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}

/// Convert a `u16` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
    char_from_u32(u32::from(u))
}
264
/// Shared empty `u16` slice, used when no supplementary tables are present.
const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];

/// Shared empty `char` slice, used when no supplementary tables are present.
const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
268
/// Returns whether `start <= c && c <= end` using a single comparison.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    // If `c < start`, the wrapping subtraction produces a huge value
    // that fails the `<=` test, so one comparison covers both bounds.
    let offset = u32::from(c).wrapping_sub(u32::from(start));
    let span = u32::from(end) - u32::from(start);
    offset <= span
}
273
/// Returns whether `start <= u && u <= end` using a single comparison
/// (same wrap-around trick as `in_inclusive_range`, for `u16` units).
#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
279
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // If `second` is outside U+1161..U+11FF (the vowel and trail jamo
    // region), no algorithmic Hangul composition applies; defer to the
    // trie-based lookup.
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        return compose_non_hangul(iter, starter, second);
    }
    // L + V -> LV syllable.
    if v < HANGUL_V_COUNT {
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    // LV + T -> LVT syllable; trail jamo are U+11A8..=U+11C2.
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        // `lv % HANGUL_T_COUNT == 0` means the starter is an LV syllable,
        // i.e. it has no trail jamo yet.
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    None
}
308
/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
///
/// The trie stores each composable pair as the key sequence
/// `second, starter` with the composite character as the final value.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // To make the trie smaller, the pairs are stored second character first.
    // Given how this method is used in ways where it's known that `second`
    // is or isn't a starter. We could potentially split the trie into two
    // tries depending on whether `second` is a starter.
    match iter.next(second) {
        // `second` participates in no composition at all.
        TrieResult::NoMatch => None,
        // `second` is a valid prefix; now look up `starter`.
        TrieResult::NoValue => match iter.next(starter) {
            TrieResult::NoMatch => None,
            // Found the composite character.
            TrieResult::FinalValue(i) => {
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    // GIGO case
                    debug_assert!(false);
                    None
                }
            }
            // Well-formed data never yields these after a full pair.
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // GIGO case
                debug_assert!(false);
                None
            }
        },
        // Well-formed data never keys a value on `second` alone.
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // GIGO case
            debug_assert!(false);
            None
        }
    }
}
343
344/// See trie-value-format.md
345#[inline(always)]
346fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
347    // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
348    // and this function needs to ignore that.
349    (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
350}
351
352/// See trie-value-format.md
353#[inline(always)]
354fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
355    (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
356}
357
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
// NOTE(review): an earlier comment here said this also holds a flag about
// supplementary-trie provenance; only the two fields below exist now, so
// any such flag presumably lives inside `trie_val` — confirm against
// trie-value-format.md.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}
368
impl CharacterAndTrieValue {
    /// Pairs `c` with its (already looked up) trie value.
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
        }
    }

    /// True if the character is a starter that decomposes to itself,
    /// ignoring the two flag bits. See trie-value-format.md
    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        starter_and_decomposes_to_self_impl(self.trie_val)
    }

    /// See trie-value-format.md
    #[inline(always)]
    #[cfg(feature = "utf8_iter")]
    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
        // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
        // to be compared with zero. U+FFFD has that flag set despite really
        // being round-tripping in order to make UTF-8 errors
        // ineligible for passthrough.
        (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
    }

    /// True if this character can combine backwards with a preceding
    /// character. See trie-value-format.md
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
    }
    /// True if the character round-trips via NFC, making it potentially
    /// eligible for passthrough. See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
    }
    /// Conjunction of `potential_passthrough()` and
    /// `!can_combine_backwards()`. See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
    }
}
410
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
435
impl CharacterAndClass {
    /// Packs `c` together with a known canonical combining class.
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
    }
    /// Packs `c` with the 0xFF placeholder class, to be resolved later
    /// via `set_ccc_from_trie_if_not_already_set` only if needed.
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    /// Packs the character from `c_tv` with the class extracted from
    /// its trie value.
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    /// Packs `c` with class 0 (a starter).
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// This method must exist for Pernosco to apply its special rendering.
    /// Also, this must not be dead code!
    pub fn character(&self) -> char {
        // Safe, because the low 24 bits came from a `char`
        // originally.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// This method must exist for Pernosco to apply its special rendering.
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
    }

    /// Unpacks both the character and its class.
    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    /// Replaces the 0xFF placeholder (if present) with the real class
    /// looked up from `trie`; no-op when the class is already set.
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 =
            ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
    }
}
473
474// This function exists as a borrow check helper.
475#[inline(always)]
476fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) {
477    // We don't look up the canonical combining class for starters
478    // of for single combining characters between starters. When
479    // there's more than one combining character between starters,
480    // we look up the canonical combining class for each character
481    // exactly once.
482    if slice.len() < 2 {
483        return;
484    }
485    slice
486        .iter_mut()
487        .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
488    slice.sort_by_key(|cc| cc.ccc());
489}
490
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
    /// The index of the next item to be read from `buffer`.
    /// The purpose of this index is to avoid having to move
    /// the rest upon every read.
    buffer_pos: usize,
    // At the start of `next()` if not `None`, this is a pending unnormalized
    // starter. When `Decomposition` appears alone, this is never a non-starter.
    // However, when `Decomposition` appears inside a `Composition`, this
    // may become a non-starter before `decomposing_next()` is called.
    pending: Option<CharacterAndTrieValue>, // None at end of stream
    // See trie-value-format.md
    trie: &'data CodePointTrie<'data, u32>,
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    /// The lowest character for which either of the following does
    /// not hold:
    /// 1. Decomposes to self.
    /// 2. Decomposition starts with a non-starter
    decomposition_passthrough_bound: u32, // never above 0xC0
    ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}
522
523impl<'data, I> Decomposition<'data, I>
524where
525    I: Iterator<Item = char>,
526{
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, without
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// Public but hidden in order to be able to use this from the
    /// collator.
    #[doc(hidden)] // used in collator
    pub fn new(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
    ) -> Self {
        // 0xC0 is the maximum decomposition passthrough bound (see the
        // field documentation); plain NFD needs no supplementary tables
        // and no ignorable support.
        Self::new_with_supplements(
            delegate,
            decompositions,
            tables,
            None,
            0xC0,
            IgnorableBehavior::Unsupported,
        )
    }
551
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, including
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    fn new_with_supplements(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
        supplementary_tables: Option<&'data DecompositionTables>,
        decomposition_passthrough_bound: u8,
        ignorable_behavior: IgnorableBehavior,
    ) -> Self {
        let mut ret = Decomposition::<I> {
            delegate,
            buffer: SmallVec::new(), // Normalized
            buffer_pos: 0,
            // Initialize with a placeholder starter in case
            // the real stream starts with a non-starter.
            pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
            trie: &decompositions.trie,
            scalars16: &tables.scalars16,
            scalars24: &tables.scalars24,
            // Fall back to shared empty slices when there are no
            // supplementary (e.g. NFKD/UTS 46) tables.
            supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars16
            } else {
                EMPTY_U16
            },
            supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars24
            } else {
                EMPTY_CHAR
            },
            decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
            ignorable_behavior,
        };
        let _ = ret.next(); // Remove the U+FFFF placeholder
        ret
    }
592
    /// Pushes the tail of a 16-bit (BMP) multi-character decomposition
    /// into `self.buffer` and returns the decomposition's starter
    /// together with the buffer index where trailing combining
    /// characters begin.
    ///
    /// `offset` and `len` address the decomposition inside `slice16`.
    /// When `only_non_starters_in_trail` is true, the whole tail is
    /// known to consist of non-starters, so per-character trie lookups
    /// are skipped and placeholder classes are recorded instead.
    fn push_decomposition16(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice16: &ZeroSlice<u16>,
    ) -> (char, usize) {
        let (starter, tail) = slice16
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .map_or_else(
                || {
                    // GIGO case
                    debug_assert!(false);
                    (REPLACEMENT_CHARACTER, EMPTY_U16)
                },
                |(first, trail)| (char_from_u16(first), trail),
            );
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer.extend(
                tail.iter()
                    .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
            );
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for u in tail.iter() {
                let ch = char_from_u16(u);
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
637
    /// Pushes the tail of a 32-bit (`char`-typed, supplementary-plane
    /// capable) multi-character decomposition into `self.buffer` and
    /// returns the decomposition's starter together with the buffer
    /// index where trailing combining characters begin.
    ///
    /// `offset` and `len` address the decomposition inside `slice32`.
    /// When `only_non_starters_in_trail` is true, the whole tail is
    /// known to consist of non-starters, so per-character trie lookups
    /// are skipped and placeholder classes are recorded instead.
    fn push_decomposition32(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice32: &ZeroSlice<char>,
    ) -> (char, usize) {
        let (starter, tail) = slice32
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .unwrap_or_else(|| {
                // GIGO case
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_CHAR)
            });
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer
                .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for ch in tail.iter() {
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
676
    /// Pairs `c` with its value looked up from the NFD trie.
    #[inline(always)]
    fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
        CharacterAndTrieValue::new(c, self.trie.get(c))
    }
681
    /// Pulls the next character from the delegate iterator and attaches
    /// its trie value, applying `ignorable_behavior` to characters whose
    /// trie value is the UTS 46 `IGNORABLE_MARKER`. Must only be called
    /// when `pending` is `None`.
    fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
        debug_assert!(self.pending.is_none());
        loop {
            let c = self.delegate.next()?;

            // TODO(#2384): Measure if this check is actually an optimization.
            if u32::from(c) < self.decomposition_passthrough_bound {
                // Below the passthrough bound, synthesize a zero trie
                // value (decomposes to self) without a trie lookup.
                return Some(CharacterAndTrieValue::new(c, 0));
            }

            let trie_val = self.trie.get(c);
            // TODO: Can we do something better about the cost of this branch in the
            // non-UTS 46 case?
            if trie_val == IGNORABLE_MARKER {
                match self.ignorable_behavior {
                    IgnorableBehavior::Unsupported => {
                        // Non-UTS 46 data is not supposed to contain the
                        // marker; fall through and return the raw value.
                        debug_assert!(false);
                    }
                    IgnorableBehavior::ReplacementCharacter => {
                        return Some(CharacterAndTrieValue::new(
                            c,
                            u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
                        ));
                    }
                    IgnorableBehavior::Ignored => {
                        // Else ignore this character by reading the next one from the delegate.
                        continue;
                    }
                }
            }
            return Some(CharacterAndTrieValue::new(c, trie_val));
        }
    }
715
716    fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
717        if let Some(pending) = self.pending.take() {
718            // Only happens as part of `Composition` and as part of
719            // the contiguous-buffer methods of `DecomposingNormalizer`.
720            // I.e. does not happen as part of standalone iterator
721            // usage of `Decomposition`.
722            Some(pending)
723        } else {
724            self.delegate_next_no_pending()
725        }
726    }
727
728    fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
729        let (starter, combining_start) = {
730            let c = c_and_trie_val.character;
731            // See trie-value-format.md
732            let decomposition = c_and_trie_val.trie_val;
733            // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
734            // and that flag needs to be ignored here.
735            if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
736                // The character is its own decomposition
737                (c, 0)
738            } else {
739                let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
740                let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
741                if !high_zeros && !low_zeros {
742                    // Decomposition into two BMP characters: starter and non-starter
743                    let starter = char_from_u32(decomposition & 0x7FFF);
744                    let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
745                    self.buffer
746                        .push(CharacterAndClass::new_with_placeholder(combining));
747                    (starter, 0)
748                } else if high_zeros {
749                    // Do the check by looking at `c` instead of looking at a marker
750                    // in `singleton` below, because if we looked at the trie value,
751                    // we'd still have to check that `c` is in the Hangul syllable
752                    // range in order for the subsequent interpretations as `char`
753                    // to be safe.
754                    // Alternatively, `FDFA_MARKER` and the Hangul marker could
755                    // be unified. That would add a branch for Hangul and remove
756                    // a branch from singleton decompositions. It seems more
757                    // important to favor Hangul syllables than singleton
758                    // decompositions.
759                    // Note that it would be valid to hoist this Hangul check
760                    // one or even two steps earlier in this check hierarchy.
761                    // Right now, it's assumed the kind of decompositions into
762                    // BMP starter and non-starter, which occur in many languages,
763                    // should be checked before Hangul syllables, which are about
764                    // one language specifically. Hopefully, we get some
765                    // instruction-level parallelism out of the disjointness of
766                    // operations on `c` and `decomposition`.
767                    let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
768                    if hangul_offset < HANGUL_S_COUNT {
769                        debug_assert_eq!(decomposition, 1);
770                        // Hangul syllable
771                        // The math here comes from page 144 of Unicode 14.0
772                        let l = hangul_offset / HANGUL_N_COUNT;
773                        let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
774                        let t = hangul_offset % HANGUL_T_COUNT;
775
776                        // The unsafe blocks here are OK, because the values stay
777                        // within the Hangul jamo block and, therefore, the scalar
778                        // value range by construction.
779                        self.buffer.push(CharacterAndClass::new_starter(unsafe {
780                            core::char::from_u32_unchecked(HANGUL_V_BASE + v)
781                        }));
782                        let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
783                        if t != 0 {
784                            self.buffer.push(CharacterAndClass::new_starter(unsafe {
785                                core::char::from_u32_unchecked(HANGUL_T_BASE + t)
786                            }));
787                            (first, 2)
788                        } else {
789                            (first, 1)
790                        }
791                    } else {
792                        let singleton = decomposition as u16;
793                        if singleton != FDFA_MARKER {
794                            // Decomposition into one BMP character
795                            let starter = char_from_u16(singleton);
796                            (starter, 0)
797                        } else {
798                            // Special case for the NFKD form of U+FDFA.
799                            self.buffer.extend(FDFA_NFKD.map(|u| {
800                                // SAFETY: `FDFA_NFKD` is known not to contain
801                                // surrogates.
802                                CharacterAndClass::new_starter(unsafe {
803                                    core::char::from_u32_unchecked(u32::from(u))
804                                })
805                            }));
806                            ('\u{0635}', 17)
807                        }
808                    }
809                } else {
810                    debug_assert!(low_zeros);
811                    // Only 12 of 14 bits used as of Unicode 16.
812                    let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
813                    // Only 3 of 4 bits used as of Unicode 16.
814                    let len_bits = decomposition & 0b1111;
815                    let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
816                    if offset < self.scalars16.len() {
817                        self.push_decomposition16(
818                            offset,
819                            (len_bits + 2) as usize,
820                            only_non_starters_in_trail,
821                            self.scalars16,
822                        )
823                    } else if offset < self.scalars16.len() + self.scalars24.len() {
824                        self.push_decomposition32(
825                            offset - self.scalars16.len(),
826                            (len_bits + 1) as usize,
827                            only_non_starters_in_trail,
828                            self.scalars24,
829                        )
830                    } else if offset
831                        < self.scalars16.len()
832                            + self.scalars24.len()
833                            + self.supplementary_scalars16.len()
834                    {
835                        self.push_decomposition16(
836                            offset - (self.scalars16.len() + self.scalars24.len()),
837                            (len_bits + 2) as usize,
838                            only_non_starters_in_trail,
839                            self.supplementary_scalars16,
840                        )
841                    } else {
842                        self.push_decomposition32(
843                            offset
844                                - (self.scalars16.len()
845                                    + self.scalars24.len()
846                                    + self.supplementary_scalars16.len()),
847                            (len_bits + 1) as usize,
848                            only_non_starters_in_trail,
849                            self.supplementary_scalars24,
850                        )
851                    }
852                }
853            }
854        };
855        // Either we're inside `Composition` or `self.pending.is_none()`.
856
857        self.gather_and_sort_combining(combining_start);
858        starter
859    }
860
    /// Reads characters from the delegate iterator until the next starter
    /// (which is stashed in `self.pending`) or the end of input, pushing the
    /// intervening non-starters onto `self.buffer`, and then sorts
    /// `self.buffer[combining_start..]` into canonical order (see
    /// `sort_slice_by_ccc`).
    fn gather_and_sort_combining(&mut self, combining_start: usize) {
        // Not a `for` loop to avoid holding a mutable reference to `self` across
        // the loop body.
        while let Some(ch_and_trie_val) = self.delegate_next() {
            if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
                // A starter: stash it for the next `decomposing_next()` call
                // and stop gathering.
                self.pending = Some(ch_and_trie_val);
                break;
            } else if !trie_value_indicates_special_non_starter_decomposition(
                ch_and_trie_val.trie_val,
            ) {
                // Ordinary non-starter; buffer it as-is with its trie value.
                self.buffer
                    .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
            } else {
                // The Tibetan special cases are starters that decompose into non-starters.
                // Each arm yields the character to append last; arms that decompose
                // into two characters push the first one directly and yield the second,
                // which the shared `self.buffer.push(mapped)` below appends.
                let mapped = match ch_and_trie_val.character {
                    '\u{0340}' => {
                        // COMBINING GRAVE TONE MARK
                        CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                    }
                    '\u{0341}' => {
                        // COMBINING ACUTE TONE MARK
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0343}' => {
                        // COMBINING GREEK KORONIS
                        CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                    }
                    '\u{0344}' => {
                        // COMBINING GREEK DIALYTIKA TONOS
                        self.buffer
                            .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0F73}' => {
                        // TIBETAN VOWEL SIGN II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                    }
                    '\u{0F75}' => {
                        // TIBETAN VOWEL SIGN UU
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                    }
                    '\u{0F81}' => {
                        // TIBETAN VOWEL SIGN REVERSED II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                    }
                    '\u{FF9E}' => {
                        // HALFWIDTH KATAKANA VOICED SOUND MARK
                        CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                    }
                    '\u{FF9F}' => {
                        // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                        // (NOTE(review): previous comment said "VOICED", but U+FF9F
                        // is the semi-voiced mark, matching U+309A below.)
                        CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                    }
                    _ => {
                        // GIGO case: the trie claimed a special decomposition for a
                        // character not handled above. Emit U+FFFD rather than panic
                        // on bad data.
                        debug_assert!(false);
                        CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                    }
                };
                self.buffer.push(mapped);
            }
        }
        // Slicing succeeds by construction; we've always ensured that `combining_start`
        // is in permissible range.
        #[allow(clippy::indexing_slicing)]
        sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
    }
934}
935
936impl<I> Iterator for Decomposition<'_, I>
937where
938    I: Iterator<Item = char>,
939{
940    type Item = char;
941
942    fn next(&mut self) -> Option<char> {
943        if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
944            self.buffer_pos += 1;
945            if self.buffer_pos == self.buffer.len() {
946                self.buffer.clear();
947                self.buffer_pos = 0;
948            }
949            return Some(ret);
950        }
951        debug_assert_eq!(self.buffer_pos, 0);
952        let c_and_trie_val = self.pending.take()?;
953        Some(self.decomposing_next(c_and_trie_val))
954    }
955}
956
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter
    /// 3. Is not a backward-combining starter
    composition_passthrough_bound: u32,
}
981
982impl<'data, I> Composition<'data, I>
983where
984    I: Iterator<Item = char>,
985{
986    fn new(
987        decomposition: Decomposition<'data, I>,
988        canonical_compositions: Char16Trie<'data>,
989        composition_passthrough_bound: u16,
990    ) -> Self {
991        Self {
992            decomposition,
993            canonical_compositions,
994            unprocessed_starter: None,
995            composition_passthrough_bound: u32::from(composition_passthrough_bound),
996        }
997    }
998
999    /// Performs canonical composition (including Hangul) on a pair of
1000    /// characters or returns `None` if these characters don't compose.
1001    /// Composition exclusions are taken into account.
1002    #[inline(always)]
1003    pub fn compose(&self, starter: char, second: char) -> Option<char> {
1004        compose(self.canonical_compositions.iter(), starter, second)
1005    }
1006
1007    /// Performs (non-Hangul) canonical composition on a pair of characters
1008    /// or returns `None` if these characters don't compose. Composition
1009    /// exclusions are taken into account.
1010    #[inline(always)]
1011    fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1012        compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1013    }
1014}
1015
impl<I> Iterator for Composition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    /// Yields the next canonically-composed character: drains buffered
    /// non-starters first, then attempts a passthrough fast track, and
    /// otherwise decomposes the pending character and composes the result
    /// contiguously (and discontiguously where CCCs permit).
    #[inline]
    fn next(&mut self) -> Option<char> {
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
        if self.unprocessed_starter.is_none() {
            // The loop is only broken out of as goto forward
            #[allow(clippy::never_loop)]
            loop {
                // Leftovers from the previous call are yielded first.
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CCC_NOT_REORDERED {
                        // Previous decomposition contains a starter. This must
                        // now become the `unprocessed_starter` for it to have
                        // a chance to compose with the upcoming characters.
                        //
                        // E.g. parenthesized Hangul in NFKC comes through here,
                        // but suitable composition exclusion could exercise this
                        // in NFC.
                        self.unprocessed_starter = Some(character);
                        break; // We already have a starter, so skip taking one from `pending`.
                    }
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
                    // character is not below `decomposition_passthrough_bound` but is
                    // below `composition_passthrough_bound`, we read from the trie
                    // unnecessarily.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            // Fast-track succeeded!
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of stream
                        return Some(undecomposed_starter.character);
                    }
                }
                break; // Not actually looping
            }
        }
        let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.

        // The point of having this boolean is to have only one call site to
        // `self.decomposition.decomposing_next`, which is hopefully beneficial for
        // code size under inlining.
        let mut attempt_composition = false;
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                // A starter left over from a previous call; it has already
                // been decomposed.
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // This is our yield point. We'll pick this up above in the
                    // next call to `next()`.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // We first loop by index to avoid moving the contents of `buffer`, but
            // if there's a discontiguous match, we'll start modifying `buffer` instead.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                let mut most_recent_skipped_ccc = ccc;
                // Contiguous composition failed; drop the already-consumed
                // prefix so index-based access below starts at the first
                // skipped character.
                {
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                    // We failed to compose a starter. Discontiguous match not allowed.
                    // We leave the starter in `buffer` for `next()` to find.
                    return Some(starter);
                }
                let mut i = 1; // We have skipped one non-starter.
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CCC_NOT_REORDERED {
                        // Discontiguous match not allowed.
                        return Some(starter);
                    }
                    // Buffer was sorted by CCC by `decomposing_next`.
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    if ccc != most_recent_skipped_ccc {
                        // Using the non-Hangul version as a micro-optimization, since
                        // we already rejected the case where `second` is a starter
                        // above, and conjoining jamo are starters.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }

            debug_assert_eq!(self.decomposition.buffer_pos, 0);

            if !self.decomposition.buffer.is_empty() {
                return Some(starter);
            }
            // Now we need to check if composition with an upcoming starter is possible.
            #[allow(clippy::unwrap_used)]
            if self.decomposition.pending.is_some() {
                // We know that `pending_starter` decomposes to start with a starter.
                // Otherwise, it would have been moved to `self.decomposition.buffer`
                // by `self.decomposing_next()`. We do this set lookup here in order
                // to get an opportunity to go back to the fast track.
                // Note that this check has to happen _after_ checking that `pending`
                // holds a character, because this flag isn't defined to be meaningful
                // when `pending` isn't holding a character.
                let pending = self.decomposition.pending.as_ref().unwrap();
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // Won't combine backwards anyway.
                    return Some(starter);
                }
                // Consume what we peeked. `unwrap` OK, because we checked `is_some()`
                // above.
                undecomposed_starter = self.decomposition.pending.take().unwrap();
                // The following line is OK, because we're about to loop back
                // to `self.decomposition.decomposing_next(c);`, which will
                // restore the between-`next()`-calls invariant of `pending`
                // before this function returns.
                attempt_composition = true;
                continue;
            }
            // End of input
            return Some(starter);
        }
    }
}
1202
// Generates a composing (NFC/NFKC/UTS 46 flavored) `normalize_to` method for
// one input encoding. The caller supplies, in order: outer attributes/docs,
// the method name, the sink trait, the input slice type, a prolog block,
// whether the input is always valid UTF (`$always_valid_utf`), the delegate's
// as-slice accessor, a fast-path block (`$fast`), and the identifiers the
// fast-path block uses (`$text`, `$sink`, `$composition`,
// `$composition_passthrough_bound`, `$undecomposed_starter`,
// `$pending_slice`, `$len_utf`). The generated method writes fully composed
// output into `$sink`.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            // Flush anything the iterator constructor already decomposed.
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            // Drop the already-consumed prefix so the index-based
                            // discontiguous scan below starts at the first skipped
                            // non-starter.
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
1390
// Expands to a public `*_to` method that writes the decomposition of `$text`
// into `$sink`. The macro supplies the shared slow-path skeleton (buffer
// draining, pending-starter handling); the caller supplies `$fast`, an
// encoding-specific fast path that copies already-normalized runs to the
// sink in bulk.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident, // name of the generated method
     $write:path, // sink trait bound, e.g. `core::fmt::Write`
     $slice:ty, // input type, e.g. `&str`, `&[u8]`, or `&[u16]`
     $prolog:block, // statements run before iteration starts
     $as_slice:ident, // delegate-iterator method returning the unconsumed slice
     $fast:block, // encoding-specific fast path
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            // Each iteration: flush the characters buffered by the iterator,
            // then handle the next pending character.
            $outer: loop {
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    // `pending` empty means the input is exhausted.
                    return Ok(());
                };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    // Snapshot of the not-yet-consumed input for bulk writes
                    // by the fast path.
                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1447
// Expands to the shared, public normalization convenience methods
// (`Cow`-returning normalizers, prefix splitters, and normalization checks)
// for the borrowed normalizer types.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `Cow<'a, str>`.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            // Pre-size for the common case where normalization doesn't grow
            // the text; matches the allocation strategy of `normalize_utf16`.
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a string slice into maximum normalized prefix and unnormalized suffix
        /// such that the concatenation of the prefix and the normalization of the suffix
        /// is the normalization of the whole input.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                ("", text)
            })
        }

        /// Return the index a string slice is normalized up to.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            // The sink compares the would-be output against the input and
            // records how much of the input it failed to match.
            let mut sink = IsNormalizedSinkStr::new(text);
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            })
        }

        /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            // Pre-size for the common case where normalization doesn't grow
            // the text; matches the allocation strategy of `normalize_utf16`.
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: The normalization check also checks for
            // UTF-8 well-formedness.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }

        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
1599
/// Borrowed version of a normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizerBorrowed<'a> {
    // Per-scalar decomposition data (backs the lookup trie).
    decompositions: &'a DecompositionData<'a>,
    // Expansion tables for decompositions that don't fit in a trie value.
    tables: &'a DecompositionTables<'a>,
    // Additional expansion tables for NFKD/UTS 46; `None` for plain NFD.
    supplementary_tables: Option<&'a DecompositionTables<'a>>,
    // Scalars below this bound are assumed to pass through decomposition.
    decomposition_passthrough_bound: u8, // never above 0xC0
    // Scalars below this bound are assumed to pass through composition.
    composition_passthrough_bound: u16,  // never above 0x0300
}
1609
1610impl DecomposingNormalizerBorrowed<'static> {
1611    /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1612    ///
1613    /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1614    /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
1615    pub const fn static_to_owned(self) -> DecomposingNormalizer {
1616        DecomposingNormalizer {
1617            decompositions: DataPayload::from_static_ref(self.decompositions),
1618            tables: DataPayload::from_static_ref(self.tables),
1619            supplementary_tables: if let Some(s) = self.supplementary_tables {
1620                // `map` not available in const context
1621                Some(DataPayload::from_static_ref(s))
1622            } else {
1623                None
1624            },
1625            decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1626            composition_passthrough_bound: self.composition_passthrough_bound,
1627        }
1628    }
1629
1630    /// NFD constructor using compiled data.
1631    ///
1632    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1633    ///
1634    /// [📚 Help choosing a constructor](icu_provider::constructors)
1635    #[cfg(feature = "compiled_data")]
1636    pub const fn new_nfd() -> Self {
1637        const _: () = assert!(
1638            crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1639                .scalars16
1640                .const_len()
1641                + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1642                    .scalars24
1643                    .const_len()
1644                <= 0xFFF,
1645            "future extension"
1646        );
1647
1648        DecomposingNormalizerBorrowed {
1649            decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1650            tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1651            supplementary_tables: None,
1652            decomposition_passthrough_bound: 0xC0,
1653            composition_passthrough_bound: 0x0300,
1654        }
1655    }
1656
1657    /// NFKD constructor using compiled data.
1658    ///
1659    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1660    ///
1661    /// [📚 Help choosing a constructor](icu_provider::constructors)
1662    #[cfg(feature = "compiled_data")]
1663    pub const fn new_nfkd() -> Self {
1664        const _: () = assert!(
1665            crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1666                .scalars16
1667                .const_len()
1668                + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1669                    .scalars24
1670                    .const_len()
1671                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1672                    .scalars16
1673                    .const_len()
1674                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1675                    .scalars24
1676                    .const_len()
1677                <= 0xFFF,
1678            "future extension"
1679        );
1680
1681        const _: () = assert!(
1682            crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1683            "invalid"
1684        );
1685
1686        let decomposition_capped =
1687            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1688                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1689            } else {
1690                0xC0
1691            };
1692        let composition_capped =
1693            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1694                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1695            } else {
1696                0x0300
1697            };
1698
1699        DecomposingNormalizerBorrowed {
1700            decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1701            tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1702            supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1703            decomposition_passthrough_bound: decomposition_capped as u8,
1704            composition_passthrough_bound: composition_capped,
1705        }
1706    }
1707
1708    #[cfg(feature = "compiled_data")]
1709    pub(crate) const fn new_uts46_decomposed() -> Self {
1710        const _: () = assert!(
1711            crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1712                .scalars16
1713                .const_len()
1714                + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1715                    .scalars24
1716                    .const_len()
1717                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1718                    .scalars16
1719                    .const_len()
1720                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1721                    .scalars24
1722                    .const_len()
1723                <= 0xFFF,
1724            "future extension"
1725        );
1726
1727        const _: () = assert!(
1728            crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1729            "invalid"
1730        );
1731
1732        let decomposition_capped =
1733            if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1734                crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1735            } else {
1736                0xC0
1737            };
1738        let composition_capped = if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1
1739            .passthrough_cap
1740            < 0x0300
1741        {
1742            crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1743        } else {
1744            0x0300
1745        };
1746
1747        DecomposingNormalizerBorrowed {
1748            decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1749            tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1750            supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1751            decomposition_passthrough_bound: decomposition_capped as u8,
1752            composition_passthrough_bound: composition_capped,
1753        }
1754    }
1755}
1756
impl<'data> DecomposingNormalizerBorrowed<'data> {
    /// Wraps a delegate iterator into a decomposing iterator
    /// adapter by using the data already held by this normalizer.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
        Decomposition::new_with_supplements(
            iter,
            self.decompositions,
            self.tables,
            self.supplementary_tables,
            self.decomposition_passthrough_bound,
            IgnorableBehavior::Unsupported,
        )
    }

    normalizer_methods!();

    decomposing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {
        },
        as_str,
        {
            // Byte-level passthrough bound: 0xC3 is the UTF-8 lead byte of
            // U+00C0, so when the scalar bound is 0xC0, every code unit
            // (ASCII, lead, or continuation) of a passthrough scalar is
            // below 0xC3. For lower bounds, restrict to ASCII via min(0x80).
            let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
                0xC3u8
            } else {
                decomposition_passthrough_bound.min(0x80) as u8
            };
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        // Resynchronize the `char` iterator to start at the
                        // byte that failed the fast-track check.
                        decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    continue 'fast;
                }
                // Flush everything consumed before `upcoming` as one
                // contiguous write.
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_str().len()
                    - upcoming.len_utf8()];
                sink.write_str(consumed_so_far_slice)?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );

    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            // Input may be ill-formed, so byte-wise passthrough is limited
            // to ASCII regardless of the scalar bound.
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // End of stream
                    // SAFETY(review): every byte of `pending_slice` has been
                    // vetted either as ASCII by the byte loop above or as part
                    // of a complete non-U+FFFD scalar in earlier iterations.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                // Resynchronize the `char` iterator to start at the byte that
                // failed the fast-track check.
                decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
                    // Note: The trie value of the REPLACEMENT CHARACTER is
                    // intentionally formatted to fail the
                    // `starter_and_decomposes_to_self` test even though it
                    // really is a starter that decomposes to self. This
                    // Allows moving the branch on REPLACEMENT CHARACTER
                    // below this `continue`.
                    continue 'fast;
                }

                // TODO: Annotate as unlikely.
                if upcoming == REPLACEMENT_CHARACTER {
                    // We might have an error, so fall out of the fast path.

                    // Since the U+FFFD might signify an error, we can't
                    // assume `upcoming.len_utf8()` for the backoff length.
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    // SAFETY(review): the possibly ill-formed bytes behind
                    // the U+FFFD were excluded by `next_back()` above; the
                    // remainder has been vetted in earlier iterations.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                    // We could call `gather_and_sort_combining` here and
                    // `continue 'outer`, but this should be better for code
                    // size.
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }

                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                // SAFETY(review): the slice ends right before `upcoming`, and
                // everything before it has been vetted as valid UTF-8.
                sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );

    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            let mut code_unit_iter = decomposition.delegate.as_slice().iter();
            'fast: loop {
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    let mut upcoming32 = u32::from(upcoming_code_unit);
                    if upcoming32 < decomposition_passthrough_bound {
                        continue 'fast;
                    }
                    // We might be doing a trie lookup by surrogate. Surrogates get
                    // a decomposition to U+FFFD.
                    let mut trie_value = decomposition.trie.get32(upcoming32);
                    if starter_and_decomposes_to_self_impl(trie_value) {
                        continue 'fast;
                    }
                    // We might now be looking at a surrogate.
                    // The loop is only broken out of as goto forward
                    #[allow(clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    // Standard UTF-16 pair-to-scalar math with
                                    // both bias terms folded into one constant.
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    // Successfully-paired surrogate. Read from the trie again.
                                    trie_value = decomposition.trie.get32(upcoming32);
                                    if starter_and_decomposes_to_self_impl(trie_value) {
                                        continue 'fast;
                                    }
                                    break 'surrogateloop;
                                } else {
                                    // `low` was not a low surrogate; undo the
                                    // extra `next()`.
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
                        // trie_value already holds a decomposition to U+FFFD.
                        break 'surrogateloop;
                    }

                    // SAFETY(review): `upcoming32` is either a non-surrogate
                    // BMP code point, a scalar assembled from a valid
                    // surrogate pair, or U+FFFD — all valid `char` values.
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);

                    let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                        - code_unit_iter.as_slice().len()
                        - upcoming.len_utf16()];
                    sink.write_slice(consumed_so_far_slice)?;

                    // Now let's figure out if we got a starter or a non-starter.
                    if decomposition_starts_with_non_starter(
                        upcoming_with_trie_value.trie_val,
                    ) {
                        // Sync with main iterator
                        decomposition.delegate = code_unit_iter.as_slice().chars();
                        // Let this trie value to be reprocessed in case it is
                        // one of the rare decomposing ones.
                        decomposition.pending = Some(upcoming_with_trie_value);
                        decomposition.gather_and_sort_combining(0);
                        continue 'outer;
                    }
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            // Sync the main iterator
            decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
}
2042
/// A normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizer {
    // Owned (`DataPayload`) counterparts of the fields of
    // `DecomposingNormalizerBorrowed`; see `as_borrowed`.
    decompositions: DataPayload<NormalizerNfdDataV1>,
    tables: DataPayload<NormalizerNfdTablesV1>,
    // Additional expansion tables for NFKD/UTS 46; `None` for plain NFD.
    supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
    decomposition_passthrough_bound: u8, // never above 0xC0
    composition_passthrough_bound: u16,  // never above 0x0300
}
2052
2053impl DecomposingNormalizer {
2054    /// Constructs a borrowed version of this type for more efficient querying.
2055    pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed {
2056        DecomposingNormalizerBorrowed {
2057            decompositions: self.decompositions.get(),
2058            tables: self.tables.get(),
2059            supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2060            decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2061            composition_passthrough_bound: self.composition_passthrough_bound,
2062        }
2063    }
2064
2065    /// NFD constructor using compiled data.
2066    ///
2067    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2068    ///
2069    /// [📚 Help choosing a constructor](icu_provider::constructors)
2070    #[cfg(feature = "compiled_data")]
2071    pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
2072        DecomposingNormalizerBorrowed::new_nfd()
2073    }
2074
2075    icu_provider::gen_buffer_data_constructors!(
2076        () -> error: DataError,
2077        functions: [
2078            new_nfd: skip,
2079            try_new_nfd_with_buffer_provider,
2080            try_new_nfd_unstable,
2081            Self,
2082        ]
2083    );
2084
2085    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2086    pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2087    where
2088        D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2089    {
2090        let decompositions: DataPayload<NormalizerNfdDataV1> =
2091            provider.load(Default::default())?.payload;
2092        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2093
2094        if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2095            // The data is from a future where there exists a normalization flavor whose
2096            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2097            // of space. If a good use case from such a decomposition flavor arises, we can
2098            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2099            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2100            // since for now the masks are hard-coded, error out.
2101            return Err(
2102                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2103            );
2104        }
2105
2106        let cap = decompositions.get().passthrough_cap;
2107        if cap > 0x0300 {
2108            return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2109        }
2110        let decomposition_capped = cap.min(0xC0);
2111        let composition_capped = cap.min(0x0300);
2112
2113        Ok(DecomposingNormalizer {
2114            decompositions,
2115            tables,
2116            supplementary_tables: None,
2117            decomposition_passthrough_bound: decomposition_capped as u8,
2118            composition_passthrough_bound: composition_capped,
2119        })
2120    }
2121
2122    icu_provider::gen_buffer_data_constructors!(
2123        () -> error: DataError,
2124        functions: [
2125            new_nfkd: skip,
2126            try_new_nfkd_with_buffer_provider,
2127            try_new_nfkd_unstable,
2128            Self,
2129        ]
2130    );
2131
2132    /// NFKD constructor using compiled data.
2133    ///
2134    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2135    ///
2136    /// [📚 Help choosing a constructor](icu_provider::constructors)
2137    #[cfg(feature = "compiled_data")]
2138    pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
2139        DecomposingNormalizerBorrowed::new_nfkd()
2140    }
2141
2142    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2143    pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2144    where
2145        D: DataProvider<NormalizerNfkdDataV1>
2146            + DataProvider<NormalizerNfdTablesV1>
2147            + DataProvider<NormalizerNfkdTablesV1>
2148            + ?Sized,
2149    {
2150        let decompositions: DataPayload<NormalizerNfkdDataV1> =
2151            provider.load(Default::default())?.payload;
2152        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2153        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2154            provider.load(Default::default())?.payload;
2155
2156        if tables.get().scalars16.len()
2157            + tables.get().scalars24.len()
2158            + supplementary_tables.get().scalars16.len()
2159            + supplementary_tables.get().scalars24.len()
2160            > 0xFFF
2161        {
2162            // The data is from a future where there exists a normalization flavor whose
2163            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2164            // of space. If a good use case from such a decomposition flavor arises, we can
2165            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2166            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2167            // since for now the masks are hard-coded, error out.
2168            return Err(
2169                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2170            );
2171        }
2172
2173        let cap = decompositions.get().passthrough_cap;
2174        if cap > 0x0300 {
2175            return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2176        }
2177        let decomposition_capped = cap.min(0xC0);
2178        let composition_capped = cap.min(0x0300);
2179
2180        Ok(DecomposingNormalizer {
2181            decompositions: decompositions.cast(),
2182            tables,
2183            supplementary_tables: Some(supplementary_tables),
2184            decomposition_passthrough_bound: decomposition_capped as u8,
2185            composition_passthrough_bound: composition_capped,
2186        })
2187    }
2188
2189    /// UTS 46 decomposed constructor (testing only)
2190    ///
2191    /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
2192    /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
2193    /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
2194    /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
2195    /// normalization is expected to deal with these characters. Making the disallowed characters
2196    /// behave like this is beneficial to data size, and this normalizer implementation cannot
2197    /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
2198    /// NFKD as of Unicode 14.
2199    ///
2200    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2201    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2202    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2203    /// Therefore, the output of this normalization may differ for different inputs that are
2204    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2205    /// to other reorderable characters.
2206    pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2207    where
2208        D: DataProvider<NormalizerUts46DataV1>
2209            + DataProvider<NormalizerNfdTablesV1>
2210            + DataProvider<NormalizerNfkdTablesV1>
2211            // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2212            + ?Sized,
2213    {
2214        let decompositions: DataPayload<NormalizerUts46DataV1> =
2215            provider.load(Default::default())?.payload;
2216        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2217        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2218            provider.load(Default::default())?.payload;
2219
2220        if tables.get().scalars16.len()
2221            + tables.get().scalars24.len()
2222            + supplementary_tables.get().scalars16.len()
2223            + supplementary_tables.get().scalars24.len()
2224            > 0xFFF
2225        {
2226            // The data is from a future where there exists a normalization flavor whose
2227            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2228            // of space. If a good use case from such a decomposition flavor arises, we can
2229            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2230            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2231            // since for now the masks are hard-coded, error out.
2232            return Err(
2233                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2234            );
2235        }
2236
2237        let cap = decompositions.get().passthrough_cap;
2238        if cap > 0x0300 {
2239            return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2240        }
2241        let decomposition_capped = cap.min(0xC0);
2242        let composition_capped = cap.min(0x0300);
2243
2244        Ok(DecomposingNormalizer {
2245            decompositions: decompositions.cast(),
2246            tables,
2247            supplementary_tables: Some(supplementary_tables),
2248            decomposition_passthrough_bound: decomposition_capped as u8,
2249            composition_passthrough_bound: composition_capped,
2250        })
2251    }
2252}
2253
/// Borrowed version of a normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizerBorrowed<'a> {
    // Composition is implemented as decomposition followed by recomposition,
    // so this wraps the full decomposing normalizer state.
    decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
    // Canonical composition data applied after decomposition.
    canonical_compositions: &'a CanonicalCompositions<'a>,
}
2260
impl ComposingNormalizerBorrowed<'static> {
    /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
    ///
    /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
    /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
    pub const fn static_to_owned(self) -> ComposingNormalizer {
        ComposingNormalizer {
            decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
            canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
        }
    }

    /// NFC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfc() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
            // The canonical composition data is shared by all three composing
            // flavors below; only the decomposing half differs.
            canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// NFKC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkc() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
            canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// This is a special building block normalization for IDNA that implements parts of the Map
    /// step and the following Normalize step.
    ///
    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
    /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
    /// to other reorderable characters.
    #[cfg(feature = "compiled_data")]
    pub(crate) const fn new_uts46() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
            canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
}
2316
impl<'data> ComposingNormalizerBorrowed<'data> {
    /// Wraps a delegate iterator into a composing iterator
    /// adapter by using the data already held by this normalizer.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
        self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
    }

    // Shared constructor for the composing iterator adapter; the public
    // `normalize_iter` above always passes `IgnorableBehavior::Unsupported`.
    fn normalize_iter_private<I: Iterator<Item = char>>(
        &self,
        iter: I,
        ignorable_behavior: IgnorableBehavior,
    ) -> Composition<'data, I> {
        Composition::new(
            Decomposition::new_with_supplements(
                iter,
                self.decomposing_normalizer.decompositions,
                self.decomposing_normalizer.tables,
                self.decomposing_normalizer.supplementary_tables,
                self.decomposing_normalizer.decomposition_passthrough_bound,
                ignorable_behavior,
            ),
            self.canonical_compositions.canonical_compositions.clone(),
            self.decomposing_normalizer.composition_passthrough_bound,
        )
    }

    // Additional methods generated by the `normalizer_methods!` macro.
    normalizer_methods!();

    composing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {},
        true,
        as_str,
        {
            // Let's hope LICM hoists this outside `'outer`.
            let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
                0xCCu8
            } else {
                // We can make this fancy if a normalization other than NFC where looking at
                // non-ASCII lead bytes is worthwhile is ever introduced.
                composition_passthrough_bound.min(0x80) as u8
            };
            // Attributes have to be on blocks, so hoisting all the way here.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
                // Byte-wise fastest path: bytes below the byte bound pass through as-is.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < composition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        // Re-sync the char-level delegate to just before the byte that
                        // fell off the fastest path, then re-examine it as a `char`.
                        composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = composition.decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                    // Can't combine backwards, hence a plain (non-backwards-combining)
                    // starter albeit past `composition_passthrough_bound`

                    // Fast-track succeeded!
                    continue 'fast;
                }
                // We need to fall off the fast path.
                composition.decomposition.pending = Some(upcoming_with_trie_value);

                // slicing and unwrap OK, because we've just evidently read enough previously.
                let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                // `unwrap` OK, because we've previously managed to read the previous character
                undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                let consumed_so_far_slice = consumed_so_far.as_str();
                sink.write_str(consumed_so_far_slice)?;
                break 'fast;
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );

    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {},
        false,
        as_slice,
        {
            'fast: loop {
                if let Some(upcoming) = composition.decomposition.delegate.next() {
                    if u32::from(upcoming) < composition_passthrough_bound {
                        // Fast-track succeeded!
                        continue 'fast;
                    }
                    // TODO: Be statically aware of fast/small trie.
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                        // Note: The trie value of the REPLACEMENT CHARACTER is
                        // intentionally formatted to fail the
                        // `potential_passthrough_and_cannot_combine_backwards`
                        // test even though it really is a starter that decomposes
                        // to self and cannot combine backwards. This
                        // allows moving the branch on REPLACEMENT CHARACTER
                        // below this `continue`.
                        continue 'fast;
                    }
                    // We need to fall off the fast path.

                    // TODO(#2006): Annotate as unlikely
                    if upcoming == REPLACEMENT_CHARACTER {
                        // Can't tell if this is an error or a literal U+FFFD in
                        // the input. Assuming the former to be sure.

                        // Since the U+FFFD might signify an error, we can't
                        // assume `upcoming.len_utf8()` for the backoff length.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
                        let back = consumed_so_far.next_back();
                        debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                        let consumed_so_far_slice = consumed_so_far.as_slice();
                        sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        composition.decomposition.pending = None;
                        break 'fast;
                    }

                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // slicing and unwrap OK, because we've just evidently read enough previously.
                    // `unwrap` OK, because we've previously managed to read the previous character
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
                    #[allow(clippy::unwrap_used)]
                    {
                        // TODO: If the previous character was below the passthrough bound,
                        // we really need to read from the trie. Otherwise, we could maintain
                        // the most-recent trie value. Need to measure what's more expensive:
                        // Remembering the trie value on each iteration or re-reading the
                        // last one after the fast-track run.
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    }
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
                    break 'fast;
                }
                // End of stream
                sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                return Ok(());
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );

    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        false,
        as_slice,
        {
            let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
            let mut upcoming32;
            // Declaring this up here is useful for getting compile errors about invalid changes
            // to the code structure below.
            let mut trie_value;
            'fast: loop {
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                    if upcoming32 < composition_passthrough_bound {
                        // No need for surrogate or U+FFFD check, because
                        // `composition_passthrough_bound` cannot be higher than
                        // U+0300.
                        // Fast-track succeeded!
                        // At this point, `trie_value` is out of sync with `upcoming32`.
                        // However, we either 1) reach the end of `code_unit_iter`, at
                        // which point nothing reads `trie_value` anymore or we
                        // execute the line immediately below this loop.
                        continue 'fast;
                    }
                    // We might be doing a trie lookup by surrogate. Surrogates get
                    // a decomposition to U+FFFD.
                    trie_value = composition.decomposition.trie.get32(upcoming32);
                    if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
                        // Can't combine backwards, hence a plain (non-backwards-combining)
                        // starter albeit past `composition_passthrough_bound`

                        // Fast-track succeeded!
                        continue 'fast;
                    }

                    // We might now be looking at a surrogate.
                    // The loop is only broken out of as goto forward
                    #[allow(clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            // High (lead) surrogate: try to pair it with the next unit.
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    // Successfully-paired surrogate. Read from the trie again.
                                    trie_value = composition.decomposition.trie.get32(upcoming32);
                                    if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
                                        // Fast-track succeeded!
                                        continue 'fast;
                                    }
                                    break 'surrogateloop;
                                } else {
                                    // Next unit is not a low surrogate: restore the iterator.
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
                        // trie_value already holds a decomposition to U+FFFD.
                        debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
                        break 'surrogateloop;
                    }

                    // SAFETY: upcoming32 can no longer be a surrogate.
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
                    // We need to fall off the fast path.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
                    // `unwrap` OK, because we've previously managed to read the previous character
                    #[allow(clippy::unwrap_used)]
                    {
                        // TODO: If the previous character was below the passthrough bound,
                        // we really need to read from the trie. Otherwise, we could maintain
                        // the most-recent trie value. Need to measure what's more expensive:
                        // Remembering the trie value on each iteration or re-reading the
                        // last one after the fast-track run.
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    }
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_slice(consumed_so_far_slice)?;
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            // Sync the main iterator
            composition.decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf16,
    );
}
2614
/// A normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizer {
    // Owned decomposing normalizer; composition is layered on top of it.
    decomposing_normalizer: DecomposingNormalizer,
    // Owned canonical composition data.
    canonical_compositions: DataPayload<NormalizerNfcV1>,
}
2621
2622impl ComposingNormalizer {
2623    /// Constructs a borrowed version of this type for more efficient querying.
2624    pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2625        ComposingNormalizerBorrowed {
2626            decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2627            canonical_compositions: self.canonical_compositions.get(),
2628        }
2629    }
2630
2631    /// NFC constructor using compiled data.
2632    ///
2633    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2634    ///
2635    /// [📚 Help choosing a constructor](icu_provider::constructors)
2636    #[cfg(feature = "compiled_data")]
2637    pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2638        ComposingNormalizerBorrowed::new_nfc()
2639    }
2640
2641    icu_provider::gen_buffer_data_constructors!(
2642        () -> error: DataError,
2643        functions: [
2644            new_nfc: skip,
2645            try_new_nfc_with_buffer_provider,
2646            try_new_nfc_unstable,
2647            Self,
2648        ]
2649    );
2650
2651    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2652    pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2653    where
2654        D: DataProvider<NormalizerNfdDataV1>
2655            + DataProvider<NormalizerNfdTablesV1>
2656            + DataProvider<NormalizerNfcV1>
2657            + ?Sized,
2658    {
2659        let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2660
2661        let canonical_compositions: DataPayload<NormalizerNfcV1> =
2662            provider.load(Default::default())?.payload;
2663
2664        Ok(ComposingNormalizer {
2665            decomposing_normalizer,
2666            canonical_compositions,
2667        })
2668    }
2669
2670    /// NFKC constructor using compiled data.
2671    ///
2672    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2673    ///
2674    /// [📚 Help choosing a constructor](icu_provider::constructors)
2675    #[cfg(feature = "compiled_data")]
2676    pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2677        ComposingNormalizerBorrowed::new_nfkc()
2678    }
2679
2680    icu_provider::gen_buffer_data_constructors!(
2681        () -> error: DataError,
2682        functions: [
2683            new_nfkc: skip,
2684            try_new_nfkc_with_buffer_provider,
2685            try_new_nfkc_unstable,
2686            Self,
2687        ]
2688    );
2689
2690    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2691    pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2692    where
2693        D: DataProvider<NormalizerNfkdDataV1>
2694            + DataProvider<NormalizerNfdTablesV1>
2695            + DataProvider<NormalizerNfkdTablesV1>
2696            + DataProvider<NormalizerNfcV1>
2697            + ?Sized,
2698    {
2699        let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2700
2701        let canonical_compositions: DataPayload<NormalizerNfcV1> =
2702            provider.load(Default::default())?.payload;
2703
2704        Ok(ComposingNormalizer {
2705            decomposing_normalizer,
2706            canonical_compositions,
2707        })
2708    }
2709
2710    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2711    pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2712    where
2713        D: DataProvider<NormalizerUts46DataV1>
2714            + DataProvider<NormalizerNfdTablesV1>
2715            + DataProvider<NormalizerNfkdTablesV1>
2716            // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2717            + DataProvider<NormalizerNfcV1>
2718            + ?Sized,
2719    {
2720        let decomposing_normalizer =
2721            DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2722
2723        let canonical_compositions: DataPayload<NormalizerNfcV1> =
2724            provider.load(Default::default())?.payload;
2725
2726        Ok(ComposingNormalizer {
2727            decomposing_normalizer,
2728            canonical_compositions,
2729        })
2730    }
2731}
2732
/// Sink used to check whether UTF-16 input is already normalized: the
/// normalizer writes into this sink, and any write that is not a verbatim
/// pass-through of the remaining input is reported as an error.
#[cfg(feature = "utf16_iter")]
struct IsNormalizedSinkUtf16<'a> {
    /// The not-yet-matched tail of the original input.
    expect: &'a [u16],
}

#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
    /// Wraps the input slice the normalizer output is compared against.
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    /// Number of input units not yet consumed by matching writes.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
    // Indexing is OK: a pass-through slice aliases the head of `expect`, so
    // `s.len()` cannot exceed `expect.len()`; a failure here would be a code
    // bug rather than an input or data issue.
    #[allow(clippy::indexing_slicing)]
    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
        // A slice write is a pass-through, so address equality with the head
        // of `expect` means the normalizer reproduced the input verbatim.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        self.expect = &self.expect[s.len()..];
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Consume one scalar value from `expect` iff it equals `c`.
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(expected) if expected == c => {
                self.expect = remaining.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
2774
/// Sink used to check whether potentially-invalid UTF-8 input is already
/// normalized: the normalizer writes into this sink, and any write that is
/// not a verbatim pass-through of the remaining input is reported as an
/// error.
#[cfg(feature = "utf8_iter")]
struct IsNormalizedSinkUtf8<'a> {
    /// The not-yet-matched tail of the original input.
    expect: &'a [u8],
}

#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
    /// Wraps the input slice the normalizer output is compared against.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { expect: slice }
    }
    /// Number of input bytes not yet consumed by matching writes.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
    // Indexing is OK: a pass-through `s` aliases the head of `expect`, so
    // `s.len()` cannot exceed `expect.len()`; a failure here would be a code
    // bug rather than an input or data issue.
    #[allow(clippy::indexing_slicing)]
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // A string write is a pass-through, so address equality with the head
        // of `expect` means the normalizer reproduced the input verbatim.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        self.expect = &self.expect[s.len()..];
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Consume one scalar value from `expect` iff it equals `c`.
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(expected) if expected == c => {
                self.expect = remaining.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
2816
/// Sink used to check whether `&str` input is already normalized: the
/// normalizer writes into this sink, and any write that is not a verbatim
/// pass-through of the remaining input is reported as an error.
struct IsNormalizedSinkStr<'a> {
    /// The not-yet-matched tail of the original input.
    expect: &'a str,
}

impl<'a> IsNormalizedSinkStr<'a> {
    /// Wraps the input string the normalizer output is compared against.
    pub fn new(slice: &'a str) -> Self {
        Self { expect: slice }
    }
    /// Number of input bytes not yet consumed by matching writes.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

impl core::fmt::Write for IsNormalizedSinkStr<'_> {
    // Indexing is OK: a pass-through `s` aliases the head of `expect`, so
    // `s.len()` cannot exceed `expect.len()`; a failure here would be a code
    // bug rather than an input or data issue.
    #[allow(clippy::indexing_slicing)]
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // A string write is a pass-through, so address equality with the head
        // of `expect` means the normalizer reproduced the input verbatim.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        self.expect = &self.expect[s.len()..];
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Consume one scalar value from `expect` iff it equals `c`.
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(expected) if expected == c => {
                self.expect = remaining.as_str();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}