swash/text/cluster/
complex.rs

1//! A complex cluster parser based on Microsoft's Universal Shaping Engine
2//! specification.
3
4use super::super::{Category, Codepoint, Script};
5use super::unicode_data::{ClusterBreak, UseClass};
6use super::{CharCluster, Emoji, ShapeClass, Token, Whitespace, MAX_CLUSTER_SIZE};
7
8type Kind = UseClass;
9
10pub struct ComplexState<I> {
11    chars: Tokens<I>,
12    cur: Token,
13    cur_kind: Kind,
14    cur_emoji: bool,
15    done: bool,
16}
17
18impl<I> ComplexState<I>
19where
20    I: Iterator<Item = Token> + Clone,
21{
22    pub fn new(script: Script, chars: I) -> Self {
23        let mut chars = Tokens::new(script, chars);
24        if let Some((first, kind, emoji)) = chars.by_ref().next() {
25            Self {
26                chars,
27                cur: first,
28                cur_kind: kind,
29                cur_emoji: emoji,
30                done: false,
31            }
32        } else {
33            Self {
34                chars,
35                cur: Token::default(),
36                cur_kind: UseClass::O,
37                cur_emoji: false,
38                done: true,
39            }
40        }
41    }
42
43    pub fn next(&mut self, cluster: &mut CharCluster) -> bool {
44        if self.done {
45            return false;
46        }
47        Parser::new(self, cluster).parse();
48        true
49    }
50}
51
52struct Parser<'a, I> {
53    s: &'a mut ComplexState<I>,
54    cluster: &'a mut CharCluster,
55    vt: bool,
56}
57
58impl<'a, I> Parser<'a, I>
59where
60    I: Iterator<Item = Token> + Clone,
61{
62    fn new(s: &'a mut ComplexState<I>, cluster: &'a mut CharCluster) -> Self {
63        Self {
64            s,
65            cluster,
66            vt: false,
67        }
68    }
69
70    fn parse(&mut self) -> Option<()> {
71        use UseClass::*;
72        if self.s.done {
73            return Some(());
74        }
75        if self.emoji() {
76            self.cluster.info_mut().set_emoji(Emoji::Default);
77            while self.emoji() {
78                self.accept_any_as(ShapeClass::Base)?;
79                if !self.parse_emoji_extension()? {
80                    break;
81                }
82            }
83            return Some(());
84        }
85        match self.kind() {
86            O => {
87                // This is not in the USE spec, but added to support uniform
88                // clustering of CRLF across the parsers.
89                match self.s.cur.ch {
90                    '\r' => {
91                        self.cluster.info_mut().set_space(Whitespace::Newline);
92                        self.accept_any_as(ShapeClass::Control)?;
93                        if self.s.cur.ch == '\n' {
94                            self.accept_any_as(ShapeClass::Control)?;
95                        }
96                    }
97                    '\n' => {
98                        self.cluster.info_mut().set_space(Whitespace::Newline);
99                        self.accept_any_as(ShapeClass::Control)?;
100                    }
101                    _ => {
102                        self.cluster.info_mut().set_space_from_char(self.s.cur.ch);
103                        let class = match self.s.cur.info.category() {
104                            Category::Format => match self.s.cur.ch as u32 {
105                                0x200C => ShapeClass::Zwnj,
106                                0x200D => ShapeClass::Zwj,
107                                _ => ShapeClass::Control,
108                            },
109                            Category::Control => ShapeClass::Control,
110                            _ => ShapeClass::Base,
111                        };
112                        self.accept_any_as(class)?;
113                    }
114                }
115            }
116            IND | Rsv | WJ => {
117                self.accept_any_as(ShapeClass::Base)?;
118                self.accept_as(VS, ShapeClass::Vs)?;
119            }
120            R => {
121                self.accept_any_as(ShapeClass::Reph)?;
122                self.parse_standard(false)?;
123            }
124            CS => {
125                self.accept_any()?;
126                self.parse_standard(false)?;
127            }
128            B | GB => {
129                let is_potential_symbol = self.kind() == GB;
130                self.parse_standard(is_potential_symbol)?;
131            }
132            N => {
133                self.accept_any_as(ShapeClass::Base)?;
134                self.accept_as(VS, ShapeClass::Vs)?;
135                while self.parse_halant_number()? {}
136            }
137            S => {
138                self.accept_any_as(ShapeClass::Base)?;
139                self.accept_as(VS, ShapeClass::Vs)?;
140                self.accept_zero_or_many(SMAbv)?;
141                self.accept_zero_or_many(SMBlw)?;
142            }
143            _ => {
144                self.parse_standard(false)?;
145            }
146        }
147        None
148    }
149
150    fn parse_standard(&mut self, is_potential_symbol: bool) -> Option<()> {
151        use UseClass::*;
152        match self.kind() {
153            B | GB => {
154                self.accept_any_as(ShapeClass::Base)?;
155                self.parse_standard_tail(is_potential_symbol)?;
156            }
157            _ => {
158                self.cluster.info_mut().set_broken();
159                self.accept_any_as(self.kind().to_shape_class())?;
160            }
161        }
162        Some(())
163    }
164
165    fn parse_standard_tail(&mut self, is_potential_symbol: bool) -> Option<()> {
166        use UseClass::*;
167        self.accept_as(VS, ShapeClass::Vs)?;
168        let k = self.kind();
169        if is_potential_symbol && (k == SMAbv || k == SMBlw) {
170            self.accept_zero_or_many(SMAbv)?;
171            self.accept_zero_or_many(SMBlw)?;
172            return Some(());
173        }
174        self.accept_zero_or_many(CMAbv);
175        self.accept_zero_or_many(CMBlw);
176        while self.parse_halant_base()? {}
177        if self.vt {
178            return Some(());
179        }
180        self.accept(MPre)?;
181        self.accept(MAbv)?;
182        self.accept(MBlw)?;
183        self.accept(MBlw)?;
184        self.accept(MPst)?;
185        self.accept_zero_or_many_as(VPre, ShapeClass::VPre)?;
186        self.accept_zero_or_many(VAbv)?;
187        self.accept_zero_or_many(VBlw)?;
188        self.accept_zero_or_many(VPst)?;
189        while self.parse_vowel_modifier()? {}
190        self.accept_zero_or_many(FAbv)?;
191        self.accept_zero_or_many(FBlw)?;
192        self.accept_zero_or_many(FPst)?;
193        self.accept(FM)?;
194        Some(())
195    }
196
197    fn parse_vowel_modifier(&mut self) -> Option<bool> {
198        use UseClass::*;
199        Some(match self.kind() {
200            VMPre => {
201                self.accept_any_as(ShapeClass::VMPre)?;
202                true
203            }
204            VMAbv | VMBlw | VMPst => {
205                self.accept_any()?;
206                true
207            }
208            // Spec break: some scripts allow a virama as a vowel modifier and
209            // there are particular cases of split vowel characters that
210            // decompose into vowel + halant. Accept a halant here, but emit
211            // it as Other to avoid any effects on reordering.
212            H => {
213                self.accept_any()?;
214                true
215            }
216            _ => false,
217        })
218    }
219
220    fn parse_halant_base(&mut self) -> Option<bool> {
221        use UseClass::*;
222        self.vt = false;
223        match self.kind() {
224            SUB => {
225                self.accept_any()?;
226                self.accept_zero_or_many(CMAbv)?;
227                self.accept_zero_or_many(CMBlw)?;
228                return Some(true);
229            }
230            H => {
231                self.vt = true;
232                if self.s.chars.script == Script::Khmer && self.s.cur.ch == '\u{17d2}' {
233                    self.accept_any_as(ShapeClass::Other)?;
234                } else {
235                    self.accept_any_as(ShapeClass::Halant)?;
236                }
237                match self.kind() {
238                    B => {
239                        self.vt = false;
240                        self.accept_any_as(ShapeClass::Base)?;
241                        self.accept_as(VS, ShapeClass::Vs)?;
242                        self.accept_zero_or_many(CMAbv)?;
243                        self.accept_zero_or_many(CMBlw)?;
244                        return Some(true);
245                    }
246                    _ => {
247                        return Some(false);
248                    }
249                }
250            }
251            _ => {}
252        }
253        Some(false)
254    }
255
256    fn parse_halant_number(&mut self) -> Option<bool> {
257        use UseClass::*;
258        match self.kind() {
259            HN => {
260                self.accept_any_as(ShapeClass::Halant)?;
261                match self.kind() {
262                    N => {
263                        self.accept_any_as(ShapeClass::Base)?;
264                        self.accept_as(VS, ShapeClass::Vs)?;
265                        Some(true)
266                    }
267                    _ => Some(false),
268                }
269            }
270            _ => None,
271        }
272    }
273
274    fn parse_emoji_extension(&mut self) -> Option<bool> {
275        use ClusterBreak::*;
276        loop {
277            match self.s.cur.info.cluster_break() {
278                EX => match self.s.cur.ch as u32 {
279                    0x200C => self.accept_any_as(ShapeClass::Zwnj)?,
280                    0xFE0F => {
281                        self.cluster.info_mut().set_emoji(Emoji::Color);
282                        self.cluster.note_char(&self.s.cur);
283                        self.advance()?;
284                    }
285                    0xFE0E => {
286                        self.cluster.info_mut().set_emoji(Emoji::Text);
287                        self.cluster.note_char(&self.s.cur);
288                        self.advance()?;
289                    }
290                    _ => self.accept_any_as(ShapeClass::Mark)?,
291                },
292                ZWJ => {
293                    self.accept_any_as(ShapeClass::Zwj)?;
294                    return Some(true);
295                }
296                _ => break,
297            }
298        }
299        Some(false)
300    }
301
302    #[inline(always)]
303    fn emoji(&self) -> bool {
304        self.s.cur_emoji
305    }
306
307    #[inline(always)]
308    fn kind(&self) -> Kind {
309        self.s.cur_kind
310    }
311
312    fn accept(&mut self, kind: Kind) -> Option<bool> {
313        self.accept_as(kind, ShapeClass::Other)
314    }
315
316    fn accept_as(&mut self, kind: Kind, as_class: ShapeClass) -> Option<bool> {
317        if self.s.cur_kind == kind {
318            self.accept_any_as(as_class)?;
319            Some(true)
320        } else {
321            Some(false)
322        }
323    }
324
325    fn accept_zero_or_many(&mut self, kind: Kind) -> Option<bool> {
326        let mut some = false;
327        while self.accept(kind)? {
328            some = true;
329        }
330        Some(some)
331    }
332
333    fn accept_zero_or_many_as(&mut self, kind: Kind, as_class: ShapeClass) -> Option<bool> {
334        let mut some = false;
335        while self.accept_as(kind, as_class)? {
336            some = true;
337        }
338        Some(some)
339    }
340
341    fn accept_any(&mut self) -> Option<()> {
342        self.cluster.push(&self.s.cur, ShapeClass::Other);
343        self.advance()?;
344        Some(())
345    }
346
347    fn accept_any_as(&mut self, as_class: ShapeClass) -> Option<()> {
348        self.cluster.push(&self.s.cur, as_class);
349        self.advance()?;
350        Some(())
351    }
352
353    fn advance(&mut self) -> Option<()> {
354        if self.cluster.len() as usize == MAX_CLUSTER_SIZE {
355            return None;
356        }
357        if let Some((input, kind, emoji)) = self.s.chars.next() {
358            self.s.cur = input;
359            self.s.cur_emoji = emoji;
360            self.s.cur_kind = kind;
361            if input.ch == '\u{34f}' {
362                self.accept_any_as(ShapeClass::Other)?;
363            }
364            Some(())
365        } else {
366            self.s.done = true;
367            None
368        }
369    }
370}
371
372impl UseClass {
373    pub fn to_shape_class(self) -> ShapeClass {
374        match self {
375            Self::B => ShapeClass::Base,
376            Self::H => ShapeClass::Halant,
377            Self::VPre => ShapeClass::VPre,
378            Self::VMPre => ShapeClass::VMPre,
379            Self::VBlw => ShapeClass::VBlw,
380            Self::R => ShapeClass::Reph,
381            Self::ZWNJ => ShapeClass::Zwnj,
382            Self::ZWJ => ShapeClass::Zwj,
383            _ => ShapeClass::Other,
384        }
385    }
386}
387
388#[derive(Clone)]
389struct Tokens<I> {
390    iter: I,
391    decomp: [(Token, UseClass); 3],
392    decomp_len: u8,
393    decomp_offset: u8,
394    script: Script,
395}
396
397impl<I> Tokens<I> {
398    fn new(script: Script, iter: I) -> Self {
399        Self {
400            iter,
401            decomp: [(Token::default(), UseClass::O); 3],
402            decomp_len: 0,
403            decomp_offset: 0,
404            script,
405        }
406    }
407}
408
409impl<I> Iterator for Tokens<I>
410where
411    I: Iterator<Item = Token> + Clone,
412{
413    type Item = (Token, UseClass, bool);
414
415    fn next(&mut self) -> Option<Self::Item> {
416        if self.decomp_offset < self.decomp_len {
417            let (input, class) = self.decomp[self.decomp_offset as usize];
418            self.decomp_offset += 1;
419            Some((input, class, false))
420        } else {
421            let input = self.iter.next()?;
422            let (class, needs_decomp, emoji) = input.info.use_class();
423            if needs_decomp {
424                self.decomp_offset = 0;
425                self.decomp_len = 0;
426                for c in input.ch.decompose().chars() {
427                    if self.decomp_len == 3 {
428                        // shouldn't happen
429                        break;
430                    }
431                    let props = c.properties();
432                    let (class, ..) = props.use_class();
433                    let c2 = Token {
434                        ch: *c,
435                        info: input.info.with_properties(props),
436                        ..input
437                    };
438                    self.decomp[self.decomp_len as usize] = (c2, class);
439                    self.decomp_len += 1;
440                }
441                //self.decomp[..self.decomp_len as usize].reverse(); //.sort_unstable_by(|a, b| a.3.cmp(&b.3));
442                return self.next();
443            } else if self.script == Script::Khmer {
444                match input.ch as u32 {
445                    0x17BE | 0x17BF | 0x17C0 | 0x17C4 | 0x17C5 => {
446                        let a = '\u{17C1}';
447                        let props = a.properties();
448                        let a_class = props.use_class().0;
449                        let a = Token {
450                            ch: a,
451                            info: input.info.with_properties(props),
452                            ..input
453                        };
454                        self.decomp[0] = (a, a_class);
455                        self.decomp[1] = (input, class);
456                        self.decomp_len = 2;
457                        self.decomp_offset = 0;
458                        return self.next();
459                    }
460                    _ => {}
461                }
462            }
463            Some((input, class, emoji))
464        }
465    }
466}