idna/
uts46.rs

1// Copyright The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! This module provides the lower-level API for UTS 46.
10//!
11//! [`Uts46::process`] is the core that the other convenience
12//! methods build on.
13//!
14//! UTS 46 flags map to this API as follows:
15//!
16//! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
17//! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
18//! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
19//! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`].
//! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, and Chrome set this flag to _false_.
21//! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browsers clearly behave as if this were _false_.)
23
24use crate::punycode::Decoder;
25use crate::punycode::InternalCaller;
26use alloc::borrow::Cow;
27use alloc::string::String;
28use core::fmt::Write;
29use idna_adapter::*;
30use smallvec::SmallVec;
31use utf8_iter::Utf8CharsEx;
32
/// Upper bound on the length of input accepted for Punycode decoding.
///
/// ICU4C-compatible constraint.
/// https://unicode-org.atlassian.net/browse/ICU-13727
const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2000;

/// Upper bound on the length of input accepted for Punycode encoding.
///
/// ICU4C-compatible constraint. (Note: ICU4C measures
/// UTF-16 and we measure UTF-32. This means that we
/// allow longer non-BMP inputs. For this implementation,
/// the denial-of-service scaling does not depend on BMP vs.
/// non-BMP: only the scalar values matter.)
///
/// https://unicode-org.atlassian.net/browse/ICU-13727
const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1000;
45
/// For keeping track of what kind of numerals have been
/// seen in an RTL label.
#[derive(Debug, PartialEq, Eq)]
enum RtlNumeralState {
    /// NOTE(review): presumably the state before any numeral has been
    /// seen in the label — confirm against the use site.
    Undecided,
    /// NOTE(review): presumably European numerals have been seen —
    /// confirm against the use site.
    European,
    /// NOTE(review): presumably Arabic numerals have been seen —
    /// confirm against the use site.
    Arabic,
}
54
/// Builds the bit set of upper-case ASCII letters.
///
/// Bit `n` of the result is set iff the ASCII code point `n` is in
/// the range `A`..=`Z`.
const fn upper_case_mask() -> u128 {
    let mut mask = 0u128;
    // Only the letters themselves can contribute a bit, so walk just
    // that range. (`for` loops are not available in `const fn`.)
    let mut letter = b'A';
    while letter <= b'Z' {
        mask |= 1u128 << letter;
        letter += 1;
    }
    mask
}

/// Bit set for upper-case ASCII: one bit per code point `A`..=`Z`.
const UPPER_CASE_MASK: u128 = upper_case_mask();
70
/// Builds the bit set of glyphless ASCII.
///
/// Bit `n` of the result is set iff the ASCII code point `n` is
/// U+0020 SPACE or below, or is U+007F DELETE.
const fn glyphless_mask() -> u128 {
    // Start from DELETE and then add everything up to and including SPACE.
    let mut mask = 1u128 << 0x7Fu8;
    let mut code = 0u8;
    while code <= b' ' {
        mask |= 1u128 << code;
        code += 1;
    }
    mask
}

/// Bit set for glyphless ASCII: U+0000..=U+0020 and U+007F.
const GLYPHLESS_MASK: u128 = glyphless_mask();
86
/// The mask for the ASCII dot: a bit set with only the bit for
/// U+002E FULL STOP set, using the same one-bit-per-ASCII-code-point
/// layout as the other masks in this file.
const DOT_MASK: u128 = 1 << b'.';
89
/// Builds the ASCII deny list for the STD3 ASCII rules.
///
/// Every ASCII code point is denied (bit set) except lower-case
/// letters, digits, the hyphen, and the dot. Upper-case letters are
/// therefore part of the deny list.
const fn ldh_mask() -> u128 {
    let mut deny = 0u128;
    let mut code = 0u8;
    while code < 128 {
        let allowed = matches!(code, b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.');
        if !allowed {
            deny |= 1u128 << code;
        }
        code += 1;
    }
    deny
}
102
/// The Punycode prefix `XN--` packed into a `u32` with the first
/// byte of the prefix (`X`) in the least significant byte and the
/// last byte (`-`) in the most significant byte. This matches the
/// packing performed by `has_punycode_prefix`.
const PUNYCODE_PREFIX: u32 =
    ((b'-' as u32) << 24) | ((b'-' as u32) << 16) | ((b'N' as u32) << 8) | b'X' as u32;

/// Mask applied to the packed first four bytes of a label before
/// comparing with `PUNYCODE_PREFIX`. The two letter positions use
/// `0xDF`, which clears the ASCII case bit (`0x20`) so that `x`/`X`
/// and `n`/`N` compare equal; the two hyphen positions use `0xFF`
/// and must match exactly.
const PUNYCODE_PREFIX_MASK: u32 = (0xFF << 24) | (0xFF << 16) | (0xDF << 8) | 0xDF;
107
108fn write_punycode_label<W: Write + ?Sized>(
109    label: &[char],
110    sink: &mut W,
111) -> Result<(), ProcessingError> {
112    sink.write_str("xn--")?;
113    crate::punycode::encode_into::<_, _, InternalCaller>(label.iter().copied(), sink)?;
114    Ok(())
115}
116
117#[inline(always)]
118fn has_punycode_prefix(slice: &[u8]) -> bool {
119    if slice.len() < 4 {
120        return false;
121    }
122    // Sadly, the optimizer doesn't figure out that more idiomatic code
123    // should compile to masking on 32-bit value.
124    let a = slice[0];
125    let b = slice[1];
126    let c = slice[2];
127    let d = slice[3];
128    let u = (u32::from(d) << 24) | (u32::from(c) << 16) | (u32::from(b) << 8) | u32::from(a);
129    (u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX
130}
131
/// Returns `true` iff `start <= u <= end`, using a single unsigned
/// comparison.
///
/// Expects `start <= end`; `end - start` is computed with plain
/// subtraction.
#[inline(always)]
fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    // Values below `start` wrap around to large offsets and so fail
    // the comparison just like values above `end`.
    let offset = u.wrapping_sub(start);
    offset <= width
}
136
/// Returns `true` iff `start <= c <= end` by scalar value, using a
/// single unsigned comparison.
///
/// Expects `start <= end`; the width of the range is computed with
/// plain subtraction.
#[inline(always)]
fn in_inclusive_range_char(c: char, start: char, end: char) -> bool {
    let (value, low, high) = (u32::from(c), u32::from(start), u32::from(end));
    // Scalar values below `low` wrap around and fail the comparison.
    value.wrapping_sub(low) <= high - low
}
141
142#[inline(always)]
143fn is_passthrough_ascii_label(label: &[u8]) -> bool {
144    // XXX if we aren't performing _CheckHyphens_, this could
145    // check for "xn--" and pass through YouTube CDN node names.
146    if label.len() >= 4 && label[2] == b'-' && label[3] == b'-' {
147        return false;
148    }
149    if let Some((&first, tail)) = label.split_first() {
150        // We need to check the first and last character
151        // more strictly in case this turns out to be a
152        // label in a bidi domain name. This has the side
153        // effect that this function only accepts labels
154        // that also conform to the STD3 rules.
155        //
156        // XXX: If we are in the fail-fast mode (i.e. we don't need
157        // to be able to overwrite anything with U+FFFD), we could
158        // merely record that we've seen a digit here and error out
159        // if we later discover that the domain name is a bidi
160        // domain name.
161        if !in_inclusive_range8(first, b'a', b'z') {
162            return false;
163        }
164        for &b in tail {
165            // If we used LDH_MASK, we'd have to check
166            // the bytes for the ASCII range anyhow.
167            if in_inclusive_range8(b, b'a', b'z') {
168                continue;
169            }
170            if in_inclusive_range8(b, b'0', b'9') {
171                continue;
172            }
173            if b == b'-' {
174                continue;
175            }
176            return false;
177        }
178        label.last() != Some(&b'-')
179    } else {
180        // empty
181        true
182    }
183}
184
/// Splits `label` into an all-ASCII head and the remaining tail.
///
/// If `label` is entirely ASCII, the head is the whole label and the
/// tail is empty. Otherwise the tail starts one byte before the first
/// non-ASCII byte (or at the first byte if that byte is non-ASCII).
#[inline(always)]
fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) {
    match label.iter().position(|b| !b.is_ascii()) {
        // All ASCII
        None => (label, &[]),
        // First is non-ASCII
        Some(0) => (&[], label),
        // Leave one ASCII character in the suffix
        // in case it's a letter that a combining
        // character combines with.
        Some(first_non_ascii) => label.split_at(first_non_ascii - 1),
    }
}
203
// Input known to be lower-case, but may contain non-ASCII.
//
// Returns U+FFFD if `c` is an ASCII character whose bit is set in
// `deny_list`; returns `c` unchanged otherwise. Non-ASCII characters
// have no bit in the 128-bit set (the checked shift fails for them)
// and therefore always pass through.
#[inline(always)]
fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char {
    match 1u128.checked_shl(u32::from(c)) {
        Some(bit) if (deny_list & bit) != 0 => '\u{FFFD}',
        _ => c,
    }
}
217
218// Input known to be ASCII, but may contain upper case ASCII.
219#[inline(always)]
220fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char {
221    if (deny_list & (1u128 << b)) == 0 {
222        return char::from(b);
223    }
224    if in_inclusive_range8(b, b'A', b'Z') {
225        return char::from(b + 0x20);
226    }
227    '\u{FFFD}'
228}
229
/// Returns `true` iff every character of `label` is ASCII. The empty
/// label counts as ASCII.
#[inline(always)]
fn is_ascii(label: &[char]) -> bool {
    label.iter().all(char::is_ascii)
}
239
/// The result of `classify_for_punycode`.
#[derive(PartialEq, Eq, Copy, Clone)]
enum PunycodeClassification {
    /// Every character of the label is ASCII (includes the empty label).
    Ascii,
    /// The label contains at least one non-ASCII character and no
    /// U+FFFD REPLACEMENT CHARACTER.
    Unicode,
    /// The label contains U+FFFD REPLACEMENT CHARACTER.
    Error,
}
246
247#[inline(always)]
248fn classify_for_punycode(label: &[char]) -> PunycodeClassification {
249    let mut iter = label.iter().copied();
250    loop {
251        if let Some(c) = iter.next() {
252            if c.is_ascii() {
253                continue;
254            }
255            if c == '\u{FFFD}' {
256                return PunycodeClassification::Error;
257            }
258            for c in iter {
259                if c == '\u{FFFD}' {
260                    return PunycodeClassification::Error;
261                }
262            }
263            return PunycodeClassification::Unicode;
264        }
265        return PunycodeClassification::Ascii;
266    }
267}
268
/// The ASCII deny list to be applied.
///
/// Represented as a 128-bit set in which bit `n` being set means
/// that the ASCII code point `n` is on the list.
#[derive(PartialEq, Eq, Copy, Clone)]
#[repr(transparent)]
pub struct AsciiDenyList {
    // One bit per ASCII code point; see the type-level documentation.
    bits: u128,
}
275
276impl AsciiDenyList {
277    /// Computes (preferably at compile time) an ASCII deny list.
278    ///
279    /// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below
280    /// as well as U+007F DELETE for convenience without having to list
281    /// these characters in the `deny_list` string.
282    ///
283    /// `deny_list` is the list of ASCII characters to deny. This
284    /// list must not contain any of:
285    /// * Letters
286    /// * Digits
287    /// * Hyphen
288    /// * Dot (period / full-stop)
289    /// * Non-ASCII
290    ///
291    /// # Panics
292    ///
293    /// If the deny list contains characters listed as prohibited above.
294    pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self {
295        let mut bits = UPPER_CASE_MASK;
296        if deny_glyphless {
297            bits |= GLYPHLESS_MASK;
298        }
299        let mut i = 0;
300        let bytes = deny_list.as_bytes();
301        while i < bytes.len() {
302            let b = bytes[i];
303            assert!(b < 0x80, "ASCII deny list must be ASCII.");
304            // assert_ne not yet available in const context.
305            assert!(b != b'.', "ASCII deny list must not contain the dot.");
306            assert!(b != b'-', "ASCII deny list must not contain the hyphen.");
307            assert!(
308                !((b >= b'0') && (b <= b'9')),
309                "ASCII deny list must not contain digits."
310            );
311            assert!(
312                !((b >= b'a') && (b <= b'z')),
313                "ASCII deny list must not contain letters."
314            );
315            assert!(
316                !((b >= b'A') && (b <= b'Z')),
317                "ASCII deny list must not contain letters."
318            );
319            bits |= 1u128 << b;
320            i += 1;
321        }
322        Self { bits }
323    }
324
325    /// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_.
326    ///
327    /// Equivalent to `AsciiDenyList::new(false, "")`.
328    ///
329    /// Note: Not denying the space and control characters can result in
330    /// strange behavior. Without a deny list provided to the UTS 46
331    /// operation, the caller is expected perform filtering afterwards,
332    /// but it's more efficient to use `AsciiDenyList` than post-processing,
333    /// because the internals of this crate can optimize away checks in
334    /// certain cases.
335    pub const EMPTY: Self = Self::new(false, "");
336
337    /// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_.
338    ///
339    /// Note that this deny list rejects the underscore, which occurs in
340    /// pseudo-hosts used by various TXT record-based protocols, and also
341    /// characters that may occurs in non-DNS naming, such as NetBIOS.
342    pub const STD3: Self = Self { bits: ldh_mask() };
343
344    /// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard.
345    ///
346    /// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^|")`.
347    ///
348    /// Note that this deny list rejects IPv6 addresses, so (as in URL
349    /// parsing) you need to check for IPv6 addresses first and not
350    /// put them through UTS 46 processing.
351    pub const URL: Self = Self::new(true, "%#/:<>?@[\\]^|");
352}
353
/// The _CheckHyphens_ mode.
///
/// The variants are listed from the least restrictive to the most
/// restrictive.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive] // non_exhaustive so that further modes can be added without a breaking change
pub enum Hyphens {
    /// _CheckHyphens=false_: Do not place positional restrictions on hyphens.
    ///
    /// This mode is used by the WHATWG URL Standard for normal User Agent processing
    /// (i.e. not conformance checking).
    Allow,

    /// Prohibit hyphens in the first and last position in the label but allow in
    /// the third and fourth position.
    ///
    /// Note that this mode rejects real-world names, including some GitHub user pages.
    CheckFirstLast,

    /// _CheckHyphens=true_: Prohibit hyphens in the first, third, fourth,
    /// and last position in the label.
    ///
    /// Note that this mode rejects real-world names, including YouTube CDN nodes
    /// and some GitHub user pages.
    Check,
}
377
/// The UTS 46 _VerifyDNSLength_ flag.
///
/// The verifying modes are implemented by [`verify_dns_length`].
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum DnsLength {
    /// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS naming systems.)
    Ignore,
    /// _VerifyDNSLength=true_ with the exception that the trailing root label dot is
    /// allowed.
    VerifyAllowRootDot,
    /// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.)
    Verify,
}
390
/// Policy for customizing behavior in case of an error.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum ErrorPolicy {
    /// Return as early as possible without producing output in case of error.
    FailFast,
    /// In case of error, mark errors with the REPLACEMENT CHARACTER. (The output
    /// containing REPLACEMENT CHARACTERs may be shown to the user to illustrate
    /// what was wrong but must not be used for naming in a network protocol.)
    MarkErrors,
}
402
/// The success outcome of [`Uts46::process`].
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingSuccess {
    /// There were no errors. The caller must consider the input to be the output.
    ///
    /// This asserts that the input can be safely passed to [`core::str::from_utf8_unchecked`].
    ///
    /// (Distinct from `WroteToSink` in order to allow `Cow` behavior to be implemented on top of
    /// [`Uts46::process`].)
    Passthrough,

    /// There were no errors. The caller must consider what was written to the sink to be the output.
    ///
    /// (Distinct from `Passthrough` in order to allow `Cow` behavior to be implemented on top of
    /// [`Uts46::process`].)
    WroteToSink,
}
420
/// The failure outcome of [`Uts46::process`].
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingError {
    /// There was a validity error according to the chosen options.
    ///
    /// In case of `Operation::ToAscii`, there is no output. Otherwise, output was written to the
    /// sink and the output contains at least one U+FFFD REPLACEMENT CHARACTER to denote an error.
    ///
    /// NOTE(review): no `Operation` type is visible in this part of the file; this
    /// presumably refers to processing with [`ErrorPolicy::FailFast`] — confirm.
    ValidityError,

    /// The sink emitted [`core::fmt::Error`]. The partial output written to the sink must not
    /// be used.
    SinkError,
}
434
/// Lets `?` turn a formatting error reported by a sink into
/// [`ProcessingError::SinkError`].
impl From<core::fmt::Error> for ProcessingError {
    fn from(_: core::fmt::Error) -> Self {
        Self::SinkError
    }
}
440
/// Exists so that `?` can be applied to the result of the Punycode
/// encoder in functions that return [`ProcessingError`].
///
/// # Panics
///
/// Always panics via `unreachable!` when invoked: the conversion is
/// expected never to run because input length is bounded by
/// `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` (the bounding code is not in
/// this impl).
impl From<crate::punycode::PunycodeEncodeError> for ProcessingError {
    fn from(_: crate::punycode::PunycodeEncodeError) -> Self {
        unreachable!(
            "Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH"
        );
    }
}
448
/// NOTE(review): the use site is outside the visible part of this
/// file; judging by the names, this describes a label that is
/// already in ASCII form — confirm.
#[derive(Debug, Clone, Copy)]
enum AlreadyAsciiLabel<'a> {
    /// Carries a byte slice; presumably an ASCII label that may contain
    /// upper-case letters — confirm against the use site.
    MixedCaseAscii(&'a [u8]),
    /// Carries a byte slice; presumably a Punycode label that may contain
    /// upper-case letters — confirm against the use site.
    MixedCasePunycode(&'a [u8]),
    /// Any other case; carries no data.
    Other,
}
455
/// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation.
///
/// Returns `true` iff all of the following hold:
///
/// * The domain name, not counting an allowed trailing root label dot,
///   is at most 253 bytes long.
/// * Every label is at least 1 and at most 63 bytes long. (Consequently,
///   the empty string and names with consecutive dots are rejected.)
///
/// If `allow_trailing_dot` is `true`, a single trailing root label dot is
/// allowed and is not counted towards the length. If it is `false`, a domain
/// name that ends with a dot is rejected.
///
/// # Panics
///
/// Panics in debug mode if the argument isn't ASCII.
pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool {
    let bytes = domain_name.as_bytes();
    debug_assert!(bytes.is_ascii());
    let without_root_dot = match bytes.strip_suffix(b".") {
        // There is a trailing dot but the caller does not allow it.
        Some(_) if !allow_trailing_dot => return false,
        Some(stripped) => stripped,
        None => bytes,
    };
    if without_root_dot.len() > 253 {
        return false;
    }
    // Splitting the empty name yields one empty label, so the empty
    // name is rejected here as well.
    without_root_dot
        .split(|b| *b == b'.')
        .all(|label| !label.is_empty() && label.len() <= 63)
}
487
/// An implementation of UTS #46.
///
/// See [`Uts46::to_ascii`], [`Uts46::to_unicode`],
/// [`Uts46::to_user_interface`], and the lower-level [`Uts46::process`].
pub struct Uts46 {
    // The data adapter obtained from the `idna_adapter` crate.
    data: idna_adapter::Adapter,
}
492
/// Available only with the `compiled_data` feature; equivalent to
/// [`Uts46::new`].
#[cfg(feature = "compiled_data")]
impl Default for Uts46 {
    fn default() -> Self {
        Self::new()
    }
}
499
500impl Uts46 {
    /// Constructor using data compiled into the binary.
    ///
    /// Available only with the `compiled_data` feature. Usable in
    /// const context.
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            data: idna_adapter::Adapter::new(),
        }
    }
508
509    // XXX Should there be an `icu_provider` feature for enabling
510    // a constructor for run-time data loading?
511
    /// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation
    /// from UTS #46 with the options indicated.
    ///
    /// The result borrows from `domain_name` when the input passes through
    /// unchanged and is an owned string otherwise.
    ///
    /// # Arguments
    ///
    /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
    ///   this method and input that is not well-formed UTF-8 is treated as an error. If you
    ///   already have a `&str`, call `.as_bytes()` on it.)
    /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
    ///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
    ///   processing is handled via this argument. Most callers are probably the best off
    ///   by using [`AsciiDenyList::URL`] here.
    /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
    ///   off by using [`Hyphens::Allow`] here.
    /// * `dns_length` - The UTS 46 _VerifyDNSLength_ flag.
    ///
    /// # Errors
    ///
    /// Returns `Err` if the input is in error according to the chosen options,
    /// including when the `dns_length` check fails.
    pub fn to_ascii<'a>(
        &self,
        domain_name: &'a [u8],
        ascii_deny_list: AsciiDenyList,
        hyphens: Hyphens,
        dns_length: DnsLength,
    ) -> Result<Cow<'a, str>, crate::Errors> {
        // Delegate to the `Cow`-taking variant with a borrowed input.
        self.to_ascii_from_cow(
            Cow::Borrowed(domain_name),
            ascii_deny_list,
            hyphens,
            dns_length,
        )
    }
541
542    pub(crate) fn to_ascii_from_cow<'a>(
543        &self,
544        domain_name: Cow<'a, [u8]>,
545        ascii_deny_list: AsciiDenyList,
546        hyphens: Hyphens,
547        dns_length: DnsLength,
548    ) -> Result<Cow<'a, str>, crate::Errors> {
549        let mut s = String::new();
550        match self.process(
551            &domain_name,
552            ascii_deny_list,
553            hyphens,
554            ErrorPolicy::FailFast,
555            |_, _, _| false,
556            &mut s,
557            None,
558        ) {
559            Ok(ProcessingSuccess::Passthrough) => {
560                // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
561                let cow = match domain_name {
562                    Cow::Borrowed(v) => Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(v) }),
563                    Cow::Owned(v) => Cow::Owned(unsafe { String::from_utf8_unchecked(v) }),
564                };
565                if dns_length != DnsLength::Ignore
566                    && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
567                {
568                    Err(crate::Errors::default())
569                } else {
570                    Ok(cow)
571                }
572            }
573            Ok(ProcessingSuccess::WroteToSink) => {
574                let cow: Cow<'_, str> = Cow::Owned(s);
575                if dns_length != DnsLength::Ignore
576                    && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
577                {
578                    Err(crate::Errors::default())
579                } else {
580                    Ok(cow)
581                }
582            }
583            Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
584            Err(ProcessingError::SinkError) => unreachable!(),
585        }
586    }
587
    /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
    /// from UTS #46 according to the options given. When there
    /// are errors, there is still output, which may be rendered to the user, even though
    /// the output must not be used in networking protocols. Errors are denoted
    /// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the
    /// return tuple is `Err`, the first item of the return tuple is guaranteed to contain
    /// at least one U+FFFD.)
    ///
    /// Most applications probably shouldn't use this method and should be using
    /// [`Uts46::to_user_interface`] instead.
    ///
    /// # Arguments
    ///
    /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
    ///   this method and input that is not well-formed UTF-8 is treated as an error. If you
    ///   already have a `&str`, call `.as_bytes()` on it.)
    /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
    ///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
    ///   processing is handled via this argument. Most callers are probably the best off
    ///   by using [`AsciiDenyList::URL`] here.
    /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
    ///   off by using [`Hyphens::Allow`] here.
    pub fn to_unicode<'a>(
        &self,
        domain_name: &'a [u8],
        ascii_deny_list: AsciiDenyList,
        hyphens: Hyphens,
    ) -> (Cow<'a, str>, Result<(), crate::Errors>) {
        // ToUnicode is the user-interface operation with every eligible
        // label output as Unicode.
        self.to_user_interface(domain_name, ascii_deny_list, hyphens, |_, _, _| true)
    }
618
619    /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
620    /// from UTS #46 according to options given with some
621    /// error-free Unicode labels output according to
622    /// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by
623    /// application policy implemented via the `output_as_unicode` closure. The purpose
624    /// is to convert user-visible domains to the Unicode form in general but to render
625    /// potentially misleading labels as Punycode.
626    ///
627    /// This is an imperfect security mechanism, because [the Punycode form itself may be
628    /// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
629    /// However, since this mechanism is common practice, this API provides support for The
630    /// the mechanism.
631    ///
632    /// ASCII labels always pass through as ASCII and labels with errors always pass through
633    /// as Unicode. For non-erroneous labels that contain at least one non-ASCII character
634    /// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label,
635    /// the TLD (potentially empty), and a flag indicating whether the domain name as a whole
636    /// is a bidi domain name. If the return value is `true`, the label passes through as
637    /// Unicode. If the return value is `false`, the label is converted to Punycode.
638    ///
639    /// When there are errors, there is still output, which may be rendered user, even through
640    /// the output must not be used in networking protocols. Errors are denoted by
641    /// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item
642    /// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain
643    /// at least one U+FFFD.) Labels that contain errors are not converted to Punycode.
644    ///
645    /// # Arguments
646    ///
647    /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
648    ///   this method and input that is not well-formed UTF-8 is treated as an error. If you
649    ///   already have a `&str`, call `.as_bytes()` on it.)
650    /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
651    ///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
652    ///   processing is handled via this argument. Most callers are probably the best off
653    ///   by using [`AsciiDenyList::URL`] here.
654    /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
655    ///   off by using [`Hyphens::Allow`] here.
656    /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
657    ///   (as opposed to Punycode). The first argument is the label for which a decision is
658    ///   needed (always non-empty slice). The second argument is the TLD (potentially empty).
659    ///   The third argument is `true` iff the domain name as a whole is a bidi domain name.
660    ///   Only non-erroneous labels that contain at least one non-ASCII character are passed
661    ///   to the closure as the first argument. The second and third argument values are
662    ///   guaranteed to remain the same during a single call to `process`, and the closure
663    ///   may cache computations derived from the second and third argument (hence the
664    ///   `FnMut` type).
665    pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
666        &self,
667        domain_name: &'a [u8],
668        ascii_deny_list: AsciiDenyList,
669        hyphens: Hyphens,
670        output_as_unicode: OutputUnicode,
671    ) -> (Cow<'a, str>, Result<(), crate::Errors>) {
672        let mut s = String::new();
673        match self.process(
674            domain_name,
675            ascii_deny_list,
676            hyphens,
677            ErrorPolicy::MarkErrors,
678            output_as_unicode,
679            &mut s,
680            None,
681        ) {
682            // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
683            Ok(ProcessingSuccess::Passthrough) => (
684                Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }),
685                Ok(()),
686            ),
687            Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())),
688            Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())),
689            Err(ProcessingError::SinkError) => unreachable!(),
690        }
691    }
692
693    /// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and
694    /// [`Uts46::to_user_interface`] are built on to allow support for output types other
695    /// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language).
696    ///
697    /// # Arguments
698    ///
699    /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
700    ///   this method and input that is not well-formed UTF-8 is treated as an error. If you
701    ///   already have a `&str`, call `.as_bytes()` on it.)
702    /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
703    ///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
704    ///   processing is handled via this argument. Most callers are probably the best off
705    ///   by using [`AsciiDenyList::URL`] here.
706    /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
707    ///   off by using [`Hyphens::Allow`] here.
708    /// * `error_policy` - Whether to fail fast or to produce output that may be rendered
709    ///   for the user to examine in case of errors.
710    /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
711    ///   (as opposed to Punycode). The first argument is the label for which a decision is
712    ///   needed (always non-empty slice). The second argument is the TLD (potentially empty).
713    ///   The third argument is `true` iff the domain name as a whole is a bidi domain name.
714    ///   Only non-erroneous labels that contain at least one non-ASCII character are passed
715    ///   to the closure as the first argument. The second and third argument values are
716    ///   guaranteed to remain the same during a single call to `process`, and the closure
717    ///   may cache computations derived from the second and third argument (hence the
718    ///   `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be
719    ///   passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must
720    ///   be passed as the closure. A more complex closure may be used to prepare a domain
721    ///   name for display in a user interface so that labels are converted to the Unicode
722    ///   form in general but potentially misleading labels are converted to the Punycode
723    ///   form.
724    /// * `sink` - The object that receives the output (in the non-passthrough case).
725    /// * `ascii_sink` - A second sink that receives the _ToASCII_ form only if there
726    ///   were no errors and `sink` received at least one character of non-ASCII output.
727    ///   The purpose of this argument is to enable a user interface display form of the
728    ///   domain and the _ToASCII_ form of the domain to be computed efficiently together.
729    ///   This argument is useless when `output_as_unicode` always returns `false`, in
730    ///   which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives
731    ///   no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the
732    ///   output received by `sink` also as the _ToASCII_ result.
733    ///
734    /// # Return value
735    ///
736    /// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat
737    ///   `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This
738    ///   return value asserts that calling `core::str::from_utf8_unchecked(domain_name)`
739    ///   is safe.)
    /// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat what was written
741    ///   to `sink` as the output. If another sink was passed as `ascii_sink` but it did
742    ///   not receive output, the caller must treat what was written to `sink` also as
743    ///   the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller
744    ///   must treat what was written to `ascii_sink` as the _ToASCII_ output.
745    /// * `Err(ProcessingError::ValidityError)` - The input was in error and must
746    ///   not be used for DNS lookup or otherwise in a network protocol. If `error_policy`
747    ///   was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed
748    ///   to the user as an illustration of where the error was or the errors were.
749    /// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned
750    ///   [`core::fmt::Error`]. The partial output written to `sink` or `ascii_sink` must not
751    ///   be used. If `W` never returns [`core::fmt::Error`], this method never returns
752    ///   `Err(ProcessingError::SinkError)`.
753    ///
754    /// # Safety-usable invariant
755    ///
756    /// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is
757    /// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note:
758    /// Other return values do _not_ imply that `domain_name` wasn't ASCII!)
759    ///
760    /// # Security considerations
761    ///
762    /// Showing labels whose Unicode form might mislead the user as Punycode instead is
763    /// an imperfect security mechanism, because [the Punycode form itself may resemble
764    /// a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
765    /// However, since this mechanism is common practice, this API provides support for the
766    /// mechanism.
767    ///
768    /// Punycode processing is quadratic, so to avoid denial of service, this method imposes
769    /// length limits on Punycode, treating especially long inputs as being in error. These
770    /// limits are well above the DNS length limits and are not more restrictive than
771    /// the limits imposed by ICU4C.
772    #[allow(clippy::too_many_arguments)]
773    pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
774        &self,
775        domain_name: &[u8],
776        ascii_deny_list: AsciiDenyList,
777        hyphens: Hyphens,
778        error_policy: ErrorPolicy,
779        mut output_as_unicode: OutputUnicode,
780        sink: &mut W,
781        ascii_sink: Option<&mut W>,
782    ) -> Result<ProcessingSuccess, ProcessingError> {
783        let fail_fast = error_policy == ErrorPolicy::FailFast;
784        let mut domain_buffer = SmallVec::<[char; 253]>::new();
785        let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; 8]>::new();
786        // `process_inner` could be pasted inline here, but it's out of line in order
787        // to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`.
788        let (passthrough_up_to, is_bidi, had_errors) = self.process_inner(
789            domain_name,
790            ascii_deny_list,
791            hyphens,
792            fail_fast,
793            &mut domain_buffer,
794            &mut already_punycode,
795        );
796        if passthrough_up_to == domain_name.len() {
797            debug_assert!(!had_errors);
798            return Ok(ProcessingSuccess::Passthrough);
799        }
800        // Checked only after passthrough as a micro optimization.
801        if fail_fast && had_errors {
802            return Err(ProcessingError::ValidityError);
803        }
804        debug_assert_eq!(had_errors, domain_buffer.contains(&'\u{FFFD}'));
805        let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.']) {
806            without_dot
807        } else {
808            &domain_buffer[..]
809        };
810        // unwrap is OK, because we always have at least one label
811        let tld = without_dot.rsplit(|c| *c == '.').next().unwrap();
812        let mut had_unicode_output = false;
813        let mut seen_label = false;
814        let mut already_punycode_iter = already_punycode.iter();
815        let mut passthrough_up_to_extended = passthrough_up_to;
816        let mut flushed_prefix = false;
817        for label in domain_buffer.split(|c| *c == '.') {
818            // Unwrap is OK, because there are supposed to be as many items in
819            // `already_punycode` as there are labels.
820            let input_punycode = *already_punycode_iter.next().unwrap();
821            if seen_label {
822                if flushed_prefix {
823                    sink.write_char('.')?;
824                } else {
825                    debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
826                    passthrough_up_to_extended += 1;
827                    if passthrough_up_to_extended == domain_name.len() {
828                        debug_assert!(!had_errors);
829                        return Ok(ProcessingSuccess::Passthrough);
830                    }
831                }
832            }
833            seen_label = true;
834
835            if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
836                if let Some(first_upper_case) =
837                    mixed_case.iter().position(|c| c.is_ascii_uppercase())
838                {
839                    let (head, tail) = mixed_case.split_at(first_upper_case);
840                    let slice_to_write = if flushed_prefix {
841                        head
842                    } else {
843                        flushed_prefix = true;
844                        passthrough_up_to_extended += head.len();
845                        debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
846                        &domain_name[..passthrough_up_to_extended]
847                    };
848                    // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
849                    sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
850                    for c in tail.iter() {
851                        sink.write_char(char::from(c.to_ascii_lowercase()))?;
852                    }
853                } else if flushed_prefix {
854                    // SAFETY: `mixed_case` is known to be ASCII.
855                    sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
856                } else {
857                    passthrough_up_to_extended += mixed_case.len();
858                    if passthrough_up_to_extended == domain_name.len() {
859                        debug_assert!(!had_errors);
860                        return Ok(ProcessingSuccess::Passthrough);
861                    }
862                }
863                continue;
864            }
865
866            let potentially_punycode = if fail_fast {
867                debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error);
868                !is_ascii(label)
869            } else {
870                classify_for_punycode(label) == PunycodeClassification::Unicode
871            };
872            let passthrough = if potentially_punycode {
873                let unicode = output_as_unicode(label, tld, is_bidi);
874                had_unicode_output |= unicode;
875                unicode
876            } else {
877                true
878            };
879            if passthrough {
880                if !flushed_prefix {
881                    flushed_prefix = true;
882                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
883                    sink.write_str(unsafe {
884                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
885                    })?;
886                }
887                for c in label.iter().copied() {
888                    sink.write_char(c)?;
889                }
890            } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode {
891                if let Some(first_upper_case) =
892                    mixed_case.iter().position(|c| c.is_ascii_uppercase())
893                {
894                    let (head, tail) = mixed_case.split_at(first_upper_case);
895                    let slice_to_write = if flushed_prefix {
896                        head
897                    } else {
898                        flushed_prefix = true;
899                        passthrough_up_to_extended += head.len();
900                        debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
901                        &domain_name[..passthrough_up_to_extended]
902                    };
903                    // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
904                    sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
905                    for c in tail.iter() {
906                        sink.write_char(char::from(c.to_ascii_lowercase()))?;
907                    }
908                } else if flushed_prefix {
909                    // SAFETY: `mixed_case` is known to be ASCII.
910                    sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
911                } else {
912                    passthrough_up_to_extended += mixed_case.len();
913                    if passthrough_up_to_extended == domain_name.len() {
914                        debug_assert!(!had_errors);
915                        return Ok(ProcessingSuccess::Passthrough);
916                    }
917                }
918            } else {
919                if !flushed_prefix {
920                    flushed_prefix = true;
921                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
922                    sink.write_str(unsafe {
923                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
924                    })?;
925                }
926                write_punycode_label(label, sink)?;
927            }
928        }
929
930        if had_errors {
931            return Err(ProcessingError::ValidityError);
932        }
933
934        if had_unicode_output {
935            if let Some(sink) = ascii_sink {
936                let mut seen_label = false;
937                let mut already_punycode_iter = already_punycode.iter();
938                let mut passthrough_up_to_extended = passthrough_up_to;
939                let mut flushed_prefix = false;
940                for label in domain_buffer.split(|c| *c == '.') {
941                    // Unwrap is OK, because there are supposed to be as many items in
942                    // `already_punycode` as there are labels.
943                    let input_punycode = *already_punycode_iter.next().unwrap();
944                    if seen_label {
945                        if flushed_prefix {
946                            sink.write_char('.')?;
947                        } else {
948                            debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
949                            passthrough_up_to_extended += 1;
950                        }
951                    }
952                    seen_label = true;
953
954                    if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
955                        if let Some(first_upper_case) =
956                            mixed_case.iter().position(|c| c.is_ascii_uppercase())
957                        {
958                            let (head, tail) = mixed_case.split_at(first_upper_case);
959                            let slice_to_write = if flushed_prefix {
960                                head
961                            } else {
962                                flushed_prefix = true;
963                                passthrough_up_to_extended += head.len();
964                                debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
965                                &domain_name[..passthrough_up_to_extended]
966                            };
967                            // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
968                            sink.write_str(unsafe {
969                                core::str::from_utf8_unchecked(slice_to_write)
970                            })?;
971                            for c in tail.iter() {
972                                sink.write_char(char::from(c.to_ascii_lowercase()))?;
973                            }
974                        } else if flushed_prefix {
975                            // SAFETY: `mixed_case` is known to be ASCII.
976                            sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
977                        } else {
978                            passthrough_up_to_extended += mixed_case.len();
979                        }
980                        continue;
981                    }
982
983                    if is_ascii(label) {
984                        if !flushed_prefix {
985                            flushed_prefix = true;
986                            // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
987                            sink.write_str(unsafe {
988                                core::str::from_utf8_unchecked(
989                                    &domain_name[..passthrough_up_to_extended],
990                                )
991                            })?;
992                        }
993                        for c in label.iter().copied() {
994                            sink.write_char(c)?;
995                        }
996                    } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode
997                    {
998                        if let Some(first_upper_case) =
999                            mixed_case.iter().position(|c| c.is_ascii_uppercase())
1000                        {
1001                            let (head, tail) = mixed_case.split_at(first_upper_case);
1002                            let slice_to_write = if flushed_prefix {
1003                                head
1004                            } else {
1005                                flushed_prefix = true;
1006                                passthrough_up_to_extended += head.len();
1007                                debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
1008                                &domain_name[..passthrough_up_to_extended]
1009                            };
1010                            // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
1011                            sink.write_str(unsafe {
1012                                core::str::from_utf8_unchecked(slice_to_write)
1013                            })?;
1014                            for c in tail.iter() {
1015                                sink.write_char(char::from(c.to_ascii_lowercase()))?;
1016                            }
1017                        } else if flushed_prefix {
1018                            // SAFETY: `mixed_case` is known to be ASCII.
1019                            sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
1020                        } else {
1021                            passthrough_up_to_extended += mixed_case.len();
1022                        }
1023                    } else {
1024                        if !flushed_prefix {
1025                            flushed_prefix = true;
1026                            // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
1027                            sink.write_str(unsafe {
1028                                core::str::from_utf8_unchecked(
1029                                    &domain_name[..passthrough_up_to_extended],
1030                                )
1031                            })?;
1032                        }
1033                        write_punycode_label(label, sink)?;
1034                    }
1035                }
1036                if !flushed_prefix {
1037                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
1038                    sink.write_str(unsafe {
1039                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
1040                    })?;
1041                }
1042            }
1043        }
1044        Ok(ProcessingSuccess::WroteToSink)
1045    }
1046
1047    /// The part of `process` that doesn't need to be generic over the sink.
1048    #[inline(always)]
1049    fn process_inner<'a>(
1050        &self,
1051        domain_name: &'a [u8],
1052        ascii_deny_list: AsciiDenyList,
1053        hyphens: Hyphens,
1054        fail_fast: bool,
1055        domain_buffer: &mut SmallVec<[char; 253]>,
1056        already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>,
1057    ) -> (usize, bool, bool) {
1058        // Sadly, this even faster-path ASCII tier is needed to avoid regressing
1059        // performance.
1060        let mut iter = domain_name.iter();
1061        let mut most_recent_label_start = iter.clone();
1062        loop {
1063            if let Some(&b) = iter.next() {
1064                if in_inclusive_range8(b, b'a', b'z') {
1065                    continue;
1066                }
1067                if b == b'.' {
1068                    most_recent_label_start = iter.clone();
1069                    continue;
1070                }
1071                return self.process_innermost(
1072                    domain_name,
1073                    ascii_deny_list,
1074                    hyphens,
1075                    fail_fast,
1076                    domain_buffer,
1077                    already_punycode,
1078                    most_recent_label_start.as_slice(),
1079                );
1080            } else {
1081                // Success! The whole input passes through on the fastest path!
1082                return (domain_name.len(), false, false);
1083            }
1084        }
1085    }
1086
1087    /// The part of `process` that doesn't need to be generic over the sink and
1088    /// can avoid monomorphizing in the interest of code size.
1089    /// Separating this into a different stack frame compared to `process_inner`
1090    /// improves performance in the ICU4X case.
1091    #[allow(clippy::too_many_arguments)]
1092    #[inline(never)]
1093    fn process_innermost<'a>(
1094        &self,
1095        domain_name: &'a [u8],
1096        ascii_deny_list: AsciiDenyList,
1097        hyphens: Hyphens,
1098        fail_fast: bool,
1099        domain_buffer: &mut SmallVec<[char; 253]>,
1100        already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>,
1101        tail: &'a [u8],
1102    ) -> (usize, bool, bool) {
1103        let deny_list = ascii_deny_list.bits;
1104        let deny_list_deny_dot = deny_list | DOT_MASK;
1105
1106        let mut had_errors = false;
1107
1108        let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name`
1109                                                                    // 253 ASCII characters is the max length for a valid domain name
1110                                                                    // (excluding the root dot).
1111        let mut current_label_start; // Index into `domain_buffer`
1112        let mut seen_label = false;
1113        let mut in_prefix = true;
1114        for label in tail.split(|b| *b == b'.') {
1115            // We check for passthrough only for the prefix. That is, if we
1116            // haven't moved on and started filling `domain_buffer`. Keeping
1117            // this stuff in one loop where the first items keep being skipped
1118            // once they have been skipped at least once instead of working
1119            // this into a fancier loop structure in order to make sure that
1120            // no item from the iterator is lost or processed twice.
1121            // Furthermore, after the passthrough fails, restarting the
1122            // normalization process after each pre-existing ASCII dot also
1123            // provides an opportunity for the processing to get back onto
1124            // an ASCII fast path that bypasses the normalizer for ASCII
1125            // after a pre-existing ASCII dot (pre-existing in the sense
1126            // of not coming from e.g. normalizing an ideographic dot).
1127            if in_prefix && is_passthrough_ascii_label(label) {
1128                if seen_label {
1129                    debug_assert_eq!(domain_name[passthrough_up_to], b'.');
1130                    passthrough_up_to += 1;
1131                }
1132                seen_label = true;
1133
1134                passthrough_up_to += label.len();
1135                continue;
1136            }
1137            if seen_label {
1138                if in_prefix {
1139                    debug_assert_eq!(domain_name[passthrough_up_to], b'.');
1140                    passthrough_up_to += 1;
1141                } else {
1142                    domain_buffer.push('.');
1143                }
1144            }
1145            seen_label = true;
1146            in_prefix = false;
1147            current_label_start = domain_buffer.len();
1148            if !label.is_empty() {
1149                let (ascii, non_ascii) = split_ascii_fast_path_prefix(label);
1150                let non_punycode_ascii_label = if non_ascii.is_empty() {
1151                    if has_punycode_prefix(ascii) {
1152                        if (ascii.last() != Some(&b'-'))
1153                            && (ascii.len() - 4 <= PUNYCODE_DECODE_MAX_INPUT_LENGTH)
1154                        {
1155                            if let Ok(decode) =
1156                                Decoder::default().decode::<u8, InternalCaller>(&ascii[4..])
1157                            {
1158                                // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
1159                                // characters.
1160                                let mut label_buffer = SmallVec::<[char; 59]>::new();
1161                                label_buffer.extend(decode);
1162
1163                                if self.after_punycode_decode(
1164                                    domain_buffer,
1165                                    current_label_start,
1166                                    &label_buffer,
1167                                    deny_list_deny_dot,
1168                                    fail_fast,
1169                                    &mut had_errors,
1170                                ) {
1171                                    return (0, false, true);
1172                                }
1173
1174                                if self.check_label(
1175                                    hyphens,
1176                                    &mut domain_buffer[current_label_start..],
1177                                    fail_fast,
1178                                    &mut had_errors,
1179                                    true,
1180                                    true,
1181                                ) {
1182                                    return (0, false, true);
1183                                }
1184                            } else {
1185                                // Punycode failed
1186                                if fail_fast {
1187                                    return (0, false, true);
1188                                }
1189                                had_errors = true;
1190                                domain_buffer.push('\u{FFFD}');
1191                                let mut iter = ascii.iter();
1192                                // Discard the first character that we replaced.
1193                                let _ = iter.next();
1194                                domain_buffer.extend(iter.map(|c| {
1195                                    // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
1196                                    // not matter.
1197                                    apply_ascii_deny_list_to_potentially_upper_case_ascii(
1198                                        *c, deny_list,
1199                                    )
1200                                }));
1201                            };
1202                            // If there were errors, we won't be trying to use this
1203                            // anyway later, so it's fine to put it here unconditionally.
1204                            already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label));
1205                            continue;
1206                        } else if fail_fast {
1207                            return (0, false, true);
1208                        }
1209                        // Else fall through to the complex path and rediscover error
1210                        // there.
1211                        false
1212                    } else {
1213                        true
1214                    }
1215                } else {
1216                    false
1217                };
1218                for c in ascii.iter().map(|c| {
1219                    // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
1220                    // not matter.
1221                    apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list)
1222                }) {
1223                    if c == '\u{FFFD}' {
1224                        if fail_fast {
1225                            return (0, false, true);
1226                        }
1227                        had_errors = true;
1228                    }
1229                    domain_buffer.push(c);
1230                }
1231                if non_punycode_ascii_label {
1232                    if hyphens != Hyphens::Allow
1233                        && check_hyphens(
1234                            &mut domain_buffer[current_label_start..],
1235                            hyphens == Hyphens::CheckFirstLast,
1236                            fail_fast,
1237                            &mut had_errors,
1238                        )
1239                    {
1240                        return (0, false, true);
1241                    }
1242                    already_punycode.push(if had_errors {
1243                        AlreadyAsciiLabel::Other
1244                    } else {
1245                        AlreadyAsciiLabel::MixedCaseAscii(label)
1246                    });
1247                    continue;
1248                }
1249                already_punycode.push(AlreadyAsciiLabel::Other);
1250                let mut first_needs_combining_mark_check = ascii.is_empty();
1251                let mut needs_contextj_check = !non_ascii.is_empty();
1252                let mut mapping = self
1253                    .data
1254                    .map_normalize(non_ascii.chars())
1255                    .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list));
1256                loop {
1257                    let n = mapping.next();
1258                    match n {
1259                        None | Some('.') => {
1260                            if domain_buffer[current_label_start..]
1261                                .starts_with(&['x', 'n', '-', '-'])
1262                            {
1263                                let mut punycode_precondition_failed = false;
1264                                for c in domain_buffer[current_label_start + 4..].iter_mut() {
1265                                    if !c.is_ascii() {
1266                                        if fail_fast {
1267                                            return (0, false, true);
1268                                        }
1269                                        had_errors = true;
1270                                        *c = '\u{FFFD}';
1271                                        punycode_precondition_failed = true;
1272                                    }
1273                                }
1274
1275                                if let Some(last) = domain_buffer.last_mut() {
1276                                    if *last == '-' {
1277                                        // Either there's nothing after the "xn--" prefix
1278                                        // and we got the last hyphen of "xn--", or there
1279                                        // are no Punycode digits after the last delimiter
1280                                        // which would result in Punycode decode outputting
1281                                        // ASCII only.
1282                                        if fail_fast {
1283                                            return (0, false, true);
1284                                        }
1285                                        had_errors = true;
1286                                        *last = '\u{FFFD}';
1287                                        punycode_precondition_failed = true;
1288                                    }
1289                                } else {
1290                                    unreachable!();
1291                                }
1292
1293                                // Reject excessively long input
1294                                // https://github.com/whatwg/url/issues/824
1295                                // https://unicode-org.atlassian.net/browse/ICU-13727
1296                                if domain_buffer.len() - current_label_start - 4
1297                                    > PUNYCODE_DECODE_MAX_INPUT_LENGTH
1298                                {
1299                                    if fail_fast {
1300                                        return (0, false, true);
1301                                    }
1302                                    had_errors = true;
1303                                    domain_buffer[current_label_start
1304                                        + 4
1305                                        + PUNYCODE_DECODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
1306                                    punycode_precondition_failed = true;
1307                                }
1308
1309                                if !punycode_precondition_failed {
1310                                    if let Ok(decode) = Decoder::default()
1311                                        .decode::<char, InternalCaller>(
1312                                            &domain_buffer[current_label_start + 4..],
1313                                        )
1314                                    {
1315                                        first_needs_combining_mark_check = true;
1316                                        needs_contextj_check = true;
1317                                        // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
1318                                        // characters.
1319                                        let mut label_buffer = SmallVec::<[char; 59]>::new();
1320                                        label_buffer.extend(decode);
1321
1322                                        domain_buffer.truncate(current_label_start);
1323                                        if self.after_punycode_decode(
1324                                            domain_buffer,
1325                                            current_label_start,
1326                                            &label_buffer,
1327                                            deny_list_deny_dot,
1328                                            fail_fast,
1329                                            &mut had_errors,
1330                                        ) {
1331                                            return (0, false, true);
1332                                        }
1333                                    } else {
1334                                        // Punycode failed
1335                                        if fail_fast {
1336                                            return (0, false, true);
1337                                        }
1338                                        had_errors = true;
1339                                        domain_buffer[current_label_start] = '\u{FFFD}';
1340                                        needs_contextj_check = false; // ASCII label
1341                                        first_needs_combining_mark_check = false;
1342                                    };
1343                                } else {
1344                                    first_needs_combining_mark_check = false;
1345                                    needs_contextj_check = false; // Non-ASCII already turned to U+FFFD.
1346                                }
1347                            }
1348                            if self.check_label(
1349                                hyphens,
1350                                &mut domain_buffer[current_label_start..],
1351                                fail_fast,
1352                                &mut had_errors,
1353                                first_needs_combining_mark_check,
1354                                needs_contextj_check,
1355                            ) {
1356                                return (0, false, true);
1357                            }
1358
1359                            if n.is_none() {
1360                                break;
1361                            }
1362                            domain_buffer.push('.');
1363                            current_label_start = domain_buffer.len();
1364                            first_needs_combining_mark_check = true;
1365                            needs_contextj_check = true;
1366                            already_punycode.push(AlreadyAsciiLabel::Other);
1367                        }
1368                        Some(c) => {
1369                            if c == '\u{FFFD}' {
1370                                if fail_fast {
1371                                    return (0, false, true);
1372                                }
1373                                had_errors = true;
1374                            }
1375                            domain_buffer.push(c);
1376                        }
1377                    }
1378                }
1379            } else {
1380                // Empty label
1381                already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label));
1382            }
1383        }
1384
1385        let is_bidi = self.is_bidi(domain_buffer);
1386        if is_bidi {
1387            for label in domain_buffer.split_mut(|c| *c == '.') {
1388                if let Some((first, tail)) = label.split_first_mut() {
1389                    let first_bc = self.data.bidi_class(*first);
1390                    if !FIRST_BC_MASK.intersects(first_bc.to_mask()) {
1391                        // Neither RTL label nor LTR label
1392                        if fail_fast {
1393                            return (0, false, true);
1394                        }
1395                        had_errors = true;
1396                        *first = '\u{FFFD}';
1397                        continue;
1398                    }
1399                    let is_ltr = first_bc.is_ltr();
1400                    // Trim NSM
1401                    let mut middle = tail;
1402                    #[allow(clippy::while_let_loop)]
1403                    loop {
1404                        if let Some((last, prior)) = middle.split_last_mut() {
1405                            let last_bc = self.data.bidi_class(*last);
1406                            if last_bc.is_nonspacing_mark() {
1407                                middle = prior;
1408                                continue;
1409                            }
1410                            let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK };
1411                            if !last_mask.intersects(last_bc.to_mask()) {
1412                                if fail_fast {
1413                                    return (0, false, true);
1414                                }
1415                                had_errors = true;
1416                                *last = '\u{FFFD}';
1417                            }
1418                            if is_ltr {
1419                                for c in prior.iter_mut() {
1420                                    let bc = self.data.bidi_class(*c);
1421                                    if !MIDDLE_LTR_MASK.intersects(bc.to_mask()) {
1422                                        if fail_fast {
1423                                            return (0, false, true);
1424                                        }
1425                                        had_errors = true;
1426                                        *c = '\u{FFFD}';
1427                                    }
1428                                }
1429                            } else {
1430                                let mut numeral_state = RtlNumeralState::Undecided;
1431                                for c in prior.iter_mut() {
1432                                    let bc = self.data.bidi_class(*c);
1433                                    if !MIDDLE_RTL_MASK.intersects(bc.to_mask()) {
1434                                        if fail_fast {
1435                                            return (0, false, true);
1436                                        }
1437                                        had_errors = true;
1438                                        *c = '\u{FFFD}';
1439                                    } else {
1440                                        match numeral_state {
1441                                            RtlNumeralState::Undecided => {
1442                                                if bc.is_european_number() {
1443                                                    numeral_state = RtlNumeralState::European;
1444                                                } else if bc.is_arabic_number() {
1445                                                    numeral_state = RtlNumeralState::Arabic;
1446                                                }
1447                                            }
1448                                            RtlNumeralState::European => {
1449                                                if bc.is_arabic_number() {
1450                                                    if fail_fast {
1451                                                        return (0, false, true);
1452                                                    }
1453                                                    had_errors = true;
1454                                                    *c = '\u{FFFD}';
1455                                                }
1456                                            }
1457                                            RtlNumeralState::Arabic => {
1458                                                if bc.is_european_number() {
1459                                                    if fail_fast {
1460                                                        return (0, false, true);
1461                                                    }
1462                                                    had_errors = true;
1463                                                    *c = '\u{FFFD}';
1464                                                }
1465                                            }
1466                                        }
1467                                    }
1468                                }
1469                                if (numeral_state == RtlNumeralState::European
1470                                    && last_bc.is_arabic_number())
1471                                    || (numeral_state == RtlNumeralState::Arabic
1472                                        && last_bc.is_european_number())
1473                                {
1474                                    if fail_fast {
1475                                        return (0, false, true);
1476                                    }
1477                                    had_errors = true;
1478                                    *last = '\u{FFFD}';
1479                                }
1480                            }
1481                            break;
1482                        } else {
1483                            // One-character label or label where
1484                            // everything after the first character
1485                            // is just non-spacing marks.
1486                            break;
1487                        }
1488                    }
1489                }
1490            }
1491        }
1492
1493        (passthrough_up_to, is_bidi, had_errors)
1494    }
1495
1496    #[inline(never)]
1497    fn after_punycode_decode(
1498        &self,
1499        domain_buffer: &mut SmallVec<[char; 253]>,
1500        current_label_start: usize,
1501        label_buffer: &[char],
1502        deny_list_deny_dot: u128,
1503        fail_fast: bool,
1504        had_errors: &mut bool,
1505    ) -> bool {
1506        for c in self
1507            .data
1508            .normalize_validate(label_buffer.iter().copied())
1509            .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot))
1510        {
1511            if c == '\u{FFFD}' {
1512                if fail_fast {
1513                    return true;
1514                }
1515                *had_errors = true;
1516            }
1517            domain_buffer.push(c);
1518        }
1519        let normalized = &mut domain_buffer[current_label_start..];
1520        if let Err(()) =
1521            normalized
1522                .iter_mut()
1523                .zip(label_buffer.iter())
1524                .try_for_each(|(norm_c, decoded_c)| {
1525                    if *norm_c == *decoded_c {
1526                        Ok(())
1527                    } else {
1528                        // Mark the first difference
1529                        *norm_c = '\u{FFFD}';
1530                        Err(())
1531                    }
1532                })
1533        {
1534            if fail_fast {
1535                return true;
1536            }
1537            *had_errors = true;
1538        }
1539        false
1540    }
1541
1542    #[inline(never)]
1543    fn check_label(
1544        &self,
1545        hyphens: Hyphens,
1546        mut_label: &mut [char],
1547        fail_fast: bool,
1548        had_errors: &mut bool,
1549        first_needs_combining_mark_check: bool,
1550        needs_contextj_check: bool,
1551    ) -> bool {
1552        if hyphens != Hyphens::Allow
1553            && check_hyphens(
1554                mut_label,
1555                hyphens == Hyphens::CheckFirstLast,
1556                fail_fast,
1557                had_errors,
1558            )
1559        {
1560            return true;
1561        }
1562        if first_needs_combining_mark_check {
1563            if let Some(first) = mut_label.first_mut() {
1564                if self.data.is_mark(*first) {
1565                    if fail_fast {
1566                        return true;
1567                    }
1568                    *had_errors = true;
1569                    *first = '\u{FFFD}';
1570                }
1571            }
1572        }
1573        if needs_contextj_check {
1574            // ContextJ
1575            for i in 0..mut_label.len() {
1576                let c = mut_label[i];
1577                if !in_inclusive_range_char(c, '\u{200C}', '\u{200D}') {
1578                    continue;
1579                }
1580                let (head, joiner_and_tail) = mut_label.split_at_mut(i);
1581
1582                if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() {
1583                    if let Some(previous) = head.last() {
1584                        if self.data.is_virama(*previous) {
1585                            continue;
1586                        }
1587                    } else {
1588                        // No preceding character
1589                        if fail_fast {
1590                            return true;
1591                        }
1592                        *had_errors = true;
1593                        *joiner = '\u{FFFD}';
1594                        continue;
1595                    }
1596                    if c == '\u{200D}' {
1597                        // ZWJ only has the virama rule
1598                        if fail_fast {
1599                            return true;
1600                        }
1601                        *had_errors = true;
1602                        *joiner = '\u{FFFD}';
1603                        continue;
1604                    }
1605                    debug_assert_eq!(c, '\u{200C}');
1606                    if !self.has_appropriately_joining_char(
1607                        head.iter().rev().copied(),
1608                        LEFT_OR_DUAL_JOINING_MASK,
1609                    ) || !self.has_appropriately_joining_char(
1610                        tail.iter().copied(),
1611                        RIGHT_OR_DUAL_JOINING_MASK,
1612                    ) {
1613                        if fail_fast {
1614                            return true;
1615                        }
1616                        *had_errors = true;
1617                        *joiner = '\u{FFFD}';
1618                    }
1619                } else {
1620                    debug_assert!(false);
1621                }
1622            }
1623        }
1624
1625        if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH {
1626            // Limit quadratic behavior
1627            // https://github.com/whatwg/url/issues/824
1628            // https://unicode-org.atlassian.net/browse/ICU-13727
1629            if fail_fast {
1630                return true;
1631            }
1632            *had_errors = true;
1633            mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
1634        }
1635        false
1636    }
1637
1638    #[inline(always)]
1639    fn has_appropriately_joining_char<I: Iterator<Item = char>>(
1640        &self,
1641        iter: I,
1642        required_mask: JoiningTypeMask,
1643    ) -> bool {
1644        for c in iter {
1645            let jt = self.data.joining_type(c);
1646            if jt.to_mask().intersects(required_mask) {
1647                return true;
1648            }
1649            if jt.is_transparent() {
1650                continue;
1651            }
1652            return false;
1653        }
1654        false
1655    }
1656
1657    #[inline(always)]
1658    fn is_bidi(&self, buffer: &[char]) -> bool {
1659        for &c in buffer {
1660            if c < '\u{0590}' {
1661                // Below Hebrew
1662                continue;
1663            }
1664            if in_inclusive_range_char(c, '\u{0900}', '\u{FB1C}') {
1665                debug_assert_ne!(c, '\u{200F}'); // disallowed
1666                continue;
1667            }
1668            if in_inclusive_range_char(c, '\u{1F000}', '\u{3FFFF}') {
1669                continue;
1670            }
1671            if in_inclusive_range_char(c, '\u{FF00}', '\u{107FF}') {
1672                continue;
1673            }
1674            if in_inclusive_range_char(c, '\u{11000}', '\u{1E7FF}') {
1675                continue;
1676            }
1677            if RTL_MASK.intersects(self.data.bidi_class(c).to_mask()) {
1678                return true;
1679            }
1680        }
1681        false
1682    }
1683}
1684
/// Checks hyphen placement in `mut_label`.
///
/// A hyphen in the first or last position is an error. Unless
/// `allow_third_fourth` is `true`, hyphens in both the third and the fourth
/// position are an error as well. The checks run in that order on the
/// already-modified label, so a trailing hyphen that has been replaced no
/// longer counts for the third/fourth check.
///
/// Each offending hyphen is replaced with U+FFFD and `*had_errors` is set.
/// When `fail_fast` is `true`, the function instead returns `true` at the
/// first error without modifying the label. In every other case it returns
/// `false`.
fn check_hyphens(
    mut_label: &mut [char],
    allow_third_fourth: bool,
    fail_fast: bool,
    had_errors: &mut bool,
) -> bool {
    const HYPHEN: char = '-';
    const REPLACEMENT: char = '\u{FFFD}';

    // Leading hyphen
    if mut_label.first() == Some(&HYPHEN) {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        mut_label[0] = REPLACEMENT;
    }

    // Trailing hyphen
    if mut_label.last() == Some(&HYPHEN) {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        let last_index = mut_label.len() - 1;
        mut_label[last_index] = REPLACEMENT;
    }

    // Hyphens in both the third and the fourth position
    if !allow_third_fourth {
        if let Some([third, fourth]) = mut_label.get_mut(2..4) {
            if *third == HYPHEN && *fourth == HYPHEN {
                if fail_fast {
                    return true;
                }
                *had_errors = true;
                *third = REPLACEMENT;
                *fourth = REPLACEMENT;
            }
        }
    }
    false
}
idna/uts46.rs

idna/
uts46.rs