swash/text/
unicode.rs

1/*!
2Unicode character properties.
3*/
4
5pub use super::compose::Decompose;
6#[doc(inline)]
7pub use super::unicode_data::{
8    BidiClass, Block, Category, ClusterBreak, JoiningType, LineBreak, Script, WordBreak,
9    UNICODE_VERSION,
10};
11
12use super::compose::{compose_pair, decompose, decompose_compat};
13use super::unicode_data::{
14    get_record_index, MyanmarClass, Record, UseClass, BRACKETS, MIRRORS, RECORDS, SCRIPTS_BY_TAG,
15    SCRIPT_COMPLEXITY, SCRIPT_NAMES, SCRIPT_TAGS,
16};
17use crate::Tag;
18
19use core::char::from_u32_unchecked;
20
/// Bit position at which the boundary state starts inside a `Properties`
/// value; everything below it is the record index.
const BOUNDARY_SHIFT: u16 = 13;
/// Mask selecting the record index bits (those below `BOUNDARY_SHIFT`).
const RECORD_MASK: u16 = (1 << BOUNDARY_SHIFT) - 1;

/// Compact, constant time reference to Unicode properties for a character.
///
/// The wrapped `u16` packs a record index in its low bits and a small
/// boundary state in the remaining high bits.
#[derive(Copy, Clone, PartialEq, Eq, Default)]
pub struct Properties(u16);
27
28impl Properties {
29    fn new(ch: u32) -> Self {
30        Self(get_record_index(ch as usize) as u16)
31    }
32
33    /// Returns the category of the character.
34    pub fn category(self) -> Category {
35        self.record().category
36    }
37
38    /// Returns the unicode block that contains the character.
39    pub fn block(self) -> Block {
40        self.record().block
41    }
42
43    /// Returns the script to which the character belongs.
44    pub fn script(self) -> Script {
45        self.record().script
46    }
47
48    /// Returns the canonical combining class of the character.
49    pub fn combining_class(self) -> u8 {
50        self.record().combining_class
51    }
52
53    /// Returns the bidirectional type of the character.
54    pub fn bidi_class(self) -> BidiClass {
55        self.record().bidi_class
56    }
57
58    /// Returns the joining type of the character.
59    pub fn joining_type(self) -> JoiningType {
60        self.record().joining_type
61    }
62
63    /// Returns the cluster break property for the character.
64    pub fn cluster_break(self) -> ClusterBreak {
65        self.record().cluster_break
66    }
67
68    /// Returns the word break property for the character.
69    pub fn word_break(self) -> WordBreak {
70        self.record().word_break
71    }
72
73    /// Returns the line break property for the character.
74    pub fn line_break(self) -> LineBreak {
75        self.record().line_break
76    }
77
78    /// Returns true if the character is an emoji.
79    pub fn is_emoji(self) -> bool {
80        self.record().flags.is_emoji()
81    }
82
83    /// Returns true if the character is an extended pictographic symbol.
84    pub fn is_extended_pictographic(self) -> bool {
85        self.record().flags.is_extended_pictographic()
86    }
87
88    /// Returns true if the character is an opening bracket.
89    pub fn is_open_bracket(self) -> bool {
90        self.record().flags.is_open_bracket()
91    }
92
93    /// Returns true if the character is a closing bracket.
94    pub fn is_close_bracket(self) -> bool {
95        self.record().flags.is_close_bracket()
96    }
97
98    pub(crate) fn is_ignorable(self) -> bool {
99        self.record().flags.is_ignorable()
100    }
101
102    pub(crate) fn is_variation_selector(self) -> bool {
103        self.record().flags.is_variation_selector()
104    }
105
106    pub(crate) fn contributes_to_shaping(self) -> bool {
107        self.record().flags.contributes_to_shaping()
108    }
109
110    pub(crate) fn with_boundary(mut self, b: u16) -> Self {
111        self.set_boundary(b);
112        self
113    }
114
115    pub(crate) fn boundary(self) -> u16 {
116        self.0 >> BOUNDARY_SHIFT
117    }
118
119    pub(crate) fn set_boundary(&mut self, boundary: u16) {
120        self.0 = (self.0 & RECORD_MASK) | (boundary & 0b11) << BOUNDARY_SHIFT;
121    }
122
123    pub(crate) fn use_class(self) -> (UseClass, bool, bool) {
124        let r = self.record();
125        (
126            r.use_class,
127            r.flags.needs_decomp(),
128            r.flags.is_extended_pictographic(),
129        )
130    }
131
132    pub(crate) fn myanmar_class(self) -> (MyanmarClass, bool) {
133        let r = self.record();
134        (r.myanmar_class, r.flags.is_extended_pictographic())
135    }
136
137    pub(crate) fn cluster_class(self) -> (ClusterBreak, bool) {
138        let r = self.record();
139        (r.cluster_break, r.flags.is_extended_pictographic())
140    }
141
142    #[inline(always)]
143    fn record(self) -> &'static Record {
144        // SAFETY: The inner index can only be generated by the private
145        // constructor which produces an in-bounds record index.
146        unsafe { RECORDS.get_unchecked((self.0 & RECORD_MASK) as usize) }
147    }
148}
149
150impl From<char> for Properties {
151    fn from(ch: char) -> Self {
152        Self::new(ch as u32)
153    }
154}
155
156impl From<&'_ char> for Properties {
157    fn from(ch: &'_ char) -> Self {
158        Self::new(*ch as u32)
159    }
160}
161
162impl From<u32> for Properties {
163    fn from(ch: u32) -> Self {
164        Self::new(ch)
165    }
166}
167
168impl From<&'_ u32> for Properties {
169    fn from(ch: &'_ u32) -> Self {
170        Self::new(*ch)
171    }
172}
173
174/// Trait that exposes Unicode properties for the `char` type.
175pub trait Codepoint: Sized + Copy {
176    /// Returns the codepoint properties.
177    fn properties(self) -> Properties;
178
179    /// Returns the category of the character.
180    fn category(self) -> Category {
181        self.properties().category()
182    }
183
184    /// Returns the unicode block that contains the character.
185    fn block(self) -> Block {
186        self.properties().block()
187    }
188
189    /// Returns the script to which the character belongs.
190    fn script(self) -> Script {
191        self.properties().script()
192    }
193
194    /// Returns the canonical combining class of the character.
195    fn combining_class(self) -> u8 {
196        self.properties().combining_class()
197    }
198
199    /// Returns the bidirectional type of the character.
200    fn bidi_class(self) -> BidiClass {
201        self.properties().bidi_class()
202    }
203
204    /// Returns the joining type of the character.
205    fn joining_type(self) -> JoiningType {
206        self.properties().joining_type()
207    }
208
209    /// Returns the cluster break property for the character.
210    fn cluster_break(self) -> ClusterBreak {
211        self.properties().cluster_break()
212    }
213
214    /// Returns the word break property for the character.
215    fn word_break(self) -> WordBreak {
216        self.properties().word_break()
217    }
218
219    /// Returns the line break property for the character.
220    fn line_break(self) -> LineBreak {
221        self.properties().line_break()
222    }
223
224    /// Returns true if the character is an emoji.
225    fn is_emoji(self) -> bool {
226        self.properties().is_emoji()
227    }
228
229    /// Returns true if the character is an extended pictographic symbol.
230    fn is_extended_pictographic(self) -> bool {
231        self.properties().is_extended_pictographic()
232    }
233
234    /// Returns the bracket type of the character.
235    fn bracket_type(self) -> BracketType;
236
237    /// If the character is a closing bracket, returns its opening bracket
238    /// pair.
239    fn opening_bracket(self) -> Option<char>;
240
241    /// If the character is an opening bracket, returns its closing bracket
242    /// pair.
243    fn closing_bracket(self) -> Option<char>;
244
245    /// Returns the mirror of the character, if any.
246    fn mirror(self) -> Option<char>;
247
248    /// Returns the composition of two characters, if any.
249    fn compose(a: char, b: char) -> Option<char>;
250
251    /// Returns the canonical decomposition of the character.
252    fn decompose(self) -> Decompose;
253
254    /// Returns the compatibility decomposition of the character.
255    fn decompose_compatible(self) -> Decompose;
256}
257
258impl Codepoint for char {
259    fn properties(self) -> Properties {
260        Properties::from(self)
261    }
262
263    fn bracket_type(self) -> BracketType {
264        match self.closing_bracket() {
265            Some(other) => BracketType::Open(other),
266            _ => match self.opening_bracket() {
267                Some(other) => BracketType::Close(other),
268                _ => BracketType::None,
269            },
270        }
271    }
272
273    fn opening_bracket(self) -> Option<char> {
274        let c = self as u32;
275        if let Ok(idx) = BRACKETS.binary_search_by(|x| (x.1 as u32).cmp(&c)) {
276            return Some(unsafe { from_u32_unchecked(BRACKETS[idx].0 as u32) });
277        }
278        None
279    }
280
281    fn closing_bracket(self) -> Option<char> {
282        let c = self as u32;
283        if let Ok(idx) = BRACKETS.binary_search_by(|x| (x.0 as u32).cmp(&c)) {
284            return Some(unsafe { from_u32_unchecked(BRACKETS[idx].1 as u32) });
285        }
286        None
287    }
288
289    fn mirror(self) -> Option<char> {
290        let c = self as u32;
291        if let Ok(idx) = MIRRORS.binary_search_by(|x| (x.0 as u32).cmp(&c)) {
292            return Some(unsafe { from_u32_unchecked(MIRRORS[idx].1 as u32) });
293        }
294        None
295    }
296
297    fn compose(a: char, b: char) -> Option<char> {
298        compose_pair(a, b)
299    }
300
301    fn decompose(self) -> Decompose {
302        decompose(self)
303    }
304
305    fn decompose_compatible(self) -> Decompose {
306        decompose_compat(self)
307    }
308}
309
/// Bracket type of a character.
///
/// This is the result type of [`Codepoint::bracket_type`]. The `char`
/// carried by `Open` and `Close` is the other half of the bracket pair.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum BracketType {
    /// Not a bracket.
    None,
    /// An opening bracket with the associated closing bracket.
    Open(char),
    /// A closing bracket with the associated opening bracket.
    Close(char),
}
320
321impl Script {
322    /// Returns the script associated with the specified OpenType language
323    /// tag.
324    pub fn from_opentype(tag: Tag) -> Option<Self> {
325        match SCRIPTS_BY_TAG.binary_search_by(|x| x.0.cmp(&tag)) {
326            Ok(index) => Some(SCRIPTS_BY_TAG[index].1),
327            _ => None,
328        }
329    }
330
331    /// Returns the name of the script.
332    pub fn name(self) -> &'static str {
333        SCRIPT_NAMES[self as usize]
334    }
335
336    /// Returns true if the script requires complex shaping.
337    pub fn is_complex(self) -> bool {
338        SCRIPT_COMPLEXITY[self as usize]
339    }
340
341    /// Returns true if the script has cursive joining.
342    pub fn is_joined(self) -> bool {
343        matches!(
344            self,
345            Script::Arabic
346                | Script::Mongolian
347                | Script::Syriac
348                | Script::Nko
349                | Script::PhagsPa
350                | Script::Mandaic
351                | Script::Manichaean
352                | Script::PsalterPahlavi
353                | Script::Adlam
354        )
355    }
356
357    /// Returns the script as an OpenType tag.
358    pub fn to_opentype(self) -> Tag {
359        SCRIPT_TAGS[self as usize]
360    }
361}
362
363impl WordBreak {
364    pub(crate) const fn mask(self) -> u32 {
365        1 << (self as u32)
366    }
367}
368
369impl BidiClass {
370    /// Returns the bidi class as a 32 bit bitmask.
371    pub const fn mask(self) -> u32 {
372        1 << (self as u32)
373    }
374
375    /// Returns true if the presence of this bidi class requires
376    /// resolution.
377    pub fn needs_resolution(self) -> bool {
378        use BidiClass::*;
379        const OVERRIDE_MASK: u32 = RLE.mask() | LRE.mask() | RLO.mask() | LRO.mask();
380        const ISOLATE_MASK: u32 = RLI.mask() | LRI.mask() | FSI.mask();
381        const EXPLICIT_MASK: u32 = OVERRIDE_MASK | ISOLATE_MASK;
382        const BIDI_MASK: u32 = EXPLICIT_MASK | R.mask() | AL.mask() | AN.mask();
383        self.mask() & BIDI_MASK != 0
384    }
385}