zerovec/ule/
chars.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#![allow(clippy::upper_case_acronyms)]
6//! ULE implementation for the `char` type.
7
8use super::*;
9use crate::impl_ule_from_array;
10use core::cmp::Ordering;
11use core::convert::TryFrom;
12
/// A 3-byte array of little-endian data corresponding to a Unicode scalar value.
///
/// The three bytes are the low-order bytes of the `char`'s `u32` value in little-endian
/// order; the fourth (most significant) byte is not stored and is treated as zero when
/// converting back. The bytes of a `CharULE` are guaranteed to represent a valid `char`,
/// so they can be converted without validation.
///
/// # Examples
///
/// Convert a `char` to a `CharULE` and back again:
///
/// ```
/// use zerovec::ule::{AsULE, CharULE, ULE};
///
/// let c1 = '𑄃';
/// let ule = c1.to_unaligned();
/// assert_eq!(CharULE::slice_as_bytes(&[ule]), &[0x03, 0x11, 0x01]);
/// let c2 = char::from_unaligned(ule);
/// assert_eq!(c1, c2);
/// ```
///
/// Attempt to parse invalid bytes to a `CharULE`:
///
/// ```
/// use zerovec::ule::{CharULE, ULE};
///
/// // 0xFFFFFF is larger than any Unicode scalar value.
/// let out_of_range: &[u8] = &[0xFF, 0xFF, 0xFF];
/// CharULE::parse_bytes_to_slice(out_of_range).expect_err("Invalid scalar value");
///
/// // The byte length must be a multiple of 3.
/// let wrong_length: &[u8] = &[0xFF, 0xFF, 0xFF, 0xFF];
/// CharULE::parse_bytes_to_slice(wrong_length).expect_err("Invalid bytes");
/// ```
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub struct CharULE([u8; 3]);
43
44impl CharULE {
45    /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling
46    /// [`AsULE::to_unaligned()`]
47    ///
48    /// See the type-level documentation for [`CharULE`] for more information.
49    #[inline]
50    pub const fn from_aligned(c: char) -> Self {
51        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
52        Self([u0, u1, u2])
53    }
54
55    /// Converts this [`CharULE`] to a [`char`]. This is equivalent to calling
56    /// [`AsULE::from_unaligned`]
57    ///
58    /// See the type-level documentation for [`CharULE`] for more information.
59    #[inline]
60    pub fn to_char(self) -> char {
61        let [b0, b1, b2] = self.0;
62        // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value.
63        unsafe { char::from_u32_unchecked(u32::from_le_bytes([b0, b1, b2, 0])) }
64    }
65
66    impl_ule_from_array!(char, CharULE, Self([0; 3]));
67}
68
69// Safety (based on the safety checklist on the ULE trait):
70//  1. CharULE does not include any uninitialized or padding bytes.
71//     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
72//  2. CharULE is aligned to 1 byte.
73//     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
74//  3. The impl of validate_bytes() returns an error if any byte is not valid.
75//  4. The impl of validate_bytes() returns an error if there are extra bytes.
76//  5. The other ULE methods use the default impl.
77//  6. CharULE byte equality is semantic equality
78unsafe impl ULE for CharULE {
79    #[inline]
80    fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
81        if bytes.len() % 3 != 0 {
82            return Err(UleError::length::<Self>(bytes.len()));
83        }
84        // Validate the bytes
85        for chunk in bytes.chunks_exact(3) {
86            // TODO: Use slice::as_chunks() when stabilized
87            #[allow(clippy::indexing_slicing)]
88            // Won't panic because the chunks are always 3 bytes long
89            let u = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]);
90            char::try_from(u).map_err(|_| UleError::parse::<Self>())?;
91        }
92        Ok(())
93    }
94}
95
96impl AsULE for char {
97    type ULE = CharULE;
98
99    #[inline]
100    fn to_unaligned(self) -> Self::ULE {
101        CharULE::from_aligned(self)
102    }
103
104    #[inline]
105    fn from_unaligned(unaligned: Self::ULE) -> Self {
106        unaligned.to_char()
107    }
108}
109
110impl PartialOrd for CharULE {
111    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
112        Some(self.cmp(other))
113    }
114}
115
116impl Ord for CharULE {
117    fn cmp(&self, other: &Self) -> Ordering {
118        char::from_unaligned(*self).cmp(&char::from_unaligned(*other))
119    }
120}
121
#[cfg(test)]
mod test {
    use super::*;

    /// `from_array` works in a const context and yields the 3-byte little-endian encoding.
    #[test]
    fn test_from_array() {
        const CHARS: [char; 2] = ['a', '🙃'];
        const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS);
        assert_eq!(
            CharULE::slice_as_bytes(&CHARS_ULE),
            &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01]
        );
    }

    /// `from_array` accepts an empty array and yields no bytes.
    #[test]
    fn test_from_array_zst() {
        const CHARS: [char; 0] = [];
        const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS);
        let bytes = CharULE::slice_as_bytes(&CHARS_ULE);
        let empty: &[u8] = &[];
        assert_eq!(bytes, empty);
    }

    /// Round-trips chars through bytes and compares against golden data.
    #[test]
    fn test_parse() {
        // 1-byte, 2-byte, 3-byte, and two 4-byte characters in UTF-8 (not as relevant in UTF-32)
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect();
        let char_bytes: &[u8] = CharULE::slice_as_bytes(&char_ules);

        // Check parsing
        let parsed_ules: &[CharULE] = CharULE::parse_bytes_to_slice(char_bytes).unwrap();
        assert_eq!(char_ules, parsed_ules);
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(&chars, parsed_chars.as_slice());

        // Compare to golden expected data
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            char_bytes
        );
    }

    /// Byte sequences that do not encode a Unicode scalar value are rejected.
    #[test]
    fn test_failures() {
        // Every input here is a whole number of 3-byte elements, so a failure can only
        // come from scalar-value validation and never from the length check.

        // 119 and 120 are valid, but not 0xD800 (high surrogate)
        let bytes: &[u8] = &[119, 0, 0, 0x00, 0xD8, 0x00, 120, 0, 0];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());

        // 0xDFFF (low surrogate) is not a scalar value either
        let bytes: &[u8] = &[0xFF, 0xDF, 0x00];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());

        // 0x110000 is the first value past the end of the Unicode range
        let bytes: &[u8] = &[0x00, 0x00, 0x11];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());

        // 0x20FFFF is out of range for a char
        let bytes: &[u8] = &[0xFF, 0xFF, 0x20];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());

        // The values adjacent to the invalid ranges are accepted:
        // 0xD7FF, 0xE000, and 0x10FFFF
        let bytes: &[u8] = &[0xFF, 0xD7, 0x00, 0x00, 0xE0, 0x00, 0xFF, 0xFF, 0x10];
        let parsed_ules: &[CharULE] = CharULE::parse_bytes_to_slice(bytes).unwrap();
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(parsed_chars, ['\u{D7FF}', '\u{E000}', '\u{10FFFF}']);
    }

    /// Input whose length is not a multiple of 3 is rejected, even if its leading
    /// bytes would form a valid element.
    #[test]
    fn test_wrong_length() {
        let bytes: &[u8] = &[119, 0, 0, 0];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());

        let bytes: &[u8] = &[119, 0];
        assert!(CharULE::parse_bytes_to_slice(bytes).is_err());
    }
}