1use super::super::{Category, Codepoint, Script};
5use super::unicode_data::{ClusterBreak, UseClass};
6use super::{CharCluster, Emoji, ShapeClass, Token, Whitespace, MAX_CLUSTER_SIZE};
7
8type Kind = UseClass;
9
/// Parsing state that persists from one cluster to the next: the token
/// source plus a one-token lookahead.
pub struct ComplexState<I> {
    /// Token source; the caller's iterator wrapped in `Tokens`.
    chars: Tokens<I>,
    /// Lookahead token, i.e. the next token to be placed into a cluster.
    cur: Token,
    /// Class reported by the token source for `cur`.
    cur_kind: Kind,
    /// Emoji flag reported by the token source for `cur`.
    cur_emoji: bool,
    /// Set once the token source has returned `None`; `next` then yields
    /// no further clusters.
    done: bool,
}
17
18impl<I> ComplexState<I>
19where
20 I: Iterator<Item = Token> + Clone,
21{
22 pub fn new(script: Script, chars: I) -> Self {
23 let mut chars = Tokens::new(script, chars);
24 if let Some((first, kind, emoji)) = chars.by_ref().next() {
25 Self {
26 chars,
27 cur: first,
28 cur_kind: kind,
29 cur_emoji: emoji,
30 done: false,
31 }
32 } else {
33 Self {
34 chars,
35 cur: Token::default(),
36 cur_kind: UseClass::O,
37 cur_emoji: false,
38 done: true,
39 }
40 }
41 }
42
43 pub fn next(&mut self, cluster: &mut CharCluster) -> bool {
44 if self.done {
45 return false;
46 }
47 Parser::new(self, cluster).parse();
48 true
49 }
50}
51
/// Parser for a single cluster; borrows the shared state and the cluster
/// that is being filled.
struct Parser<'a, I> {
    /// Shared lookahead and token source, carried over between clusters.
    s: &'a mut ComplexState<I>,
    /// Output cluster receiving the accepted tokens.
    cluster: &'a mut CharCluster,
    /// Set by `parse_halant_base` when a halant is consumed and cleared
    /// again if a base follows it; while set, `parse_standard_tail` stops
    /// before the remaining tail classes.
    vt: bool,
}
57
58impl<'a, I> Parser<'a, I>
59where
60 I: Iterator<Item = Token> + Clone,
61{
62 fn new(s: &'a mut ComplexState<I>, cluster: &'a mut CharCluster) -> Self {
63 Self {
64 s,
65 cluster,
66 vt: false,
67 }
68 }
69
    /// Parses one cluster starting at the current lookahead token.
    ///
    /// Dispatches on the emoji flag first and then on the `UseClass` of the
    /// lookahead token. Any `?` below exits early when `advance` refuses to
    /// continue (the cluster is full or the token source is exhausted).
    ///
    /// NOTE(review): every non-emoji path falls through to the trailing
    /// `None`, so the return value does not distinguish a completed cluster
    /// from an early stop. `ComplexState::next` ignores the result.
    fn parse(&mut self) -> Option<()> {
        use UseClass::*;
        // Nothing left to parse.
        if self.s.done {
            return Some(());
        }
        // Emoji sequence: a base, then its extensions; a ZWJ reported by
        // `parse_emoji_extension` lets the sequence continue with the next
        // emoji token.
        if self.emoji() {
            self.cluster.info_mut().set_emoji(Emoji::Default);
            while self.emoji() {
                self.accept_any_as(ShapeClass::Base)?;
                if !self.parse_emoji_extension()? {
                    break;
                }
            }
            return Some(());
        }
        match self.kind() {
            // Class `O`: the token forms a cluster on its own, except that
            // "\r\n" is kept together.
            O => {
                match self.s.cur.ch {
                    '\r' => {
                        self.cluster.info_mut().set_space(Whitespace::Newline);
                        self.accept_any_as(ShapeClass::Control)?;
                        // Fold a directly following line feed into the same cluster.
                        if self.s.cur.ch == '\n' {
                            self.accept_any_as(ShapeClass::Control)?;
                        }
                    }
                    '\n' => {
                        self.cluster.info_mut().set_space(Whitespace::Newline);
                        self.accept_any_as(ShapeClass::Control)?;
                    }
                    _ => {
                        // Let the cluster info derive its whitespace state from the character.
                        self.cluster.info_mut().set_space_from_char(self.s.cur.ch);
                        // Format characters are controls, with dedicated
                        // classes for U+200C and U+200D.
                        let class = match self.s.cur.info.category() {
                            Category::Format => match self.s.cur.ch as u32 {
                                0x200C => ShapeClass::Zwnj,
                                0x200D => ShapeClass::Zwj,
                                _ => ShapeClass::Control,
                            },
                            Category::Control => ShapeClass::Control,
                            _ => ShapeClass::Base,
                        };
                        self.accept_any_as(class)?;
                    }
                }
            }
            // Stand-alone token with an optional variation selector.
            IND | Rsv | WJ => {
                self.accept_any_as(ShapeClass::Base)?;
                self.accept_as(VS, ShapeClass::Vs)?;
            }
            // Leading `R` token, tagged as reph, followed by a standard cluster.
            R => {
                self.accept_any_as(ShapeClass::Reph)?;
                self.parse_standard(false)?;
            }
            // Leading `CS` token followed by a standard cluster.
            CS => {
                self.accept_any()?;
                self.parse_standard(false)?;
            }
            // Standard cluster; only a `GB` base may take the symbol-modifier tail.
            B | GB => {
                let is_potential_symbol = self.kind() == GB;
                self.parse_standard(is_potential_symbol)?;
            }
            // Number, optionally joined to further numbers by `HN` tokens.
            N => {
                self.accept_any_as(ShapeClass::Base)?;
                self.accept_as(VS, ShapeClass::Vs)?;
                while self.parse_halant_number()? {}
            }
            // Symbol with optional variation selector and symbol modifiers.
            S => {
                self.accept_any_as(ShapeClass::Base)?;
                self.accept_as(VS, ShapeClass::Vs)?;
                self.accept_zero_or_many(SMAbv)?;
                self.accept_zero_or_many(SMBlw)?;
            }
            // Anything else cannot start a cluster; `parse_standard` marks
            // the cluster as broken and consumes the token.
            _ => {
                self.parse_standard(false)?;
            }
        }
        None
    }
149
150 fn parse_standard(&mut self, is_potential_symbol: bool) -> Option<()> {
151 use UseClass::*;
152 match self.kind() {
153 B | GB => {
154 self.accept_any_as(ShapeClass::Base)?;
155 self.parse_standard_tail(is_potential_symbol)?;
156 }
157 _ => {
158 self.cluster.info_mut().set_broken();
159 self.accept_any_as(self.kind().to_shape_class())?;
160 }
161 }
162 Some(())
163 }
164
165 fn parse_standard_tail(&mut self, is_potential_symbol: bool) -> Option<()> {
166 use UseClass::*;
167 self.accept_as(VS, ShapeClass::Vs)?;
168 let k = self.kind();
169 if is_potential_symbol && (k == SMAbv || k == SMBlw) {
170 self.accept_zero_or_many(SMAbv)?;
171 self.accept_zero_or_many(SMBlw)?;
172 return Some(());
173 }
174 self.accept_zero_or_many(CMAbv);
175 self.accept_zero_or_many(CMBlw);
176 while self.parse_halant_base()? {}
177 if self.vt {
178 return Some(());
179 }
180 self.accept(MPre)?;
181 self.accept(MAbv)?;
182 self.accept(MBlw)?;
183 self.accept(MBlw)?;
184 self.accept(MPst)?;
185 self.accept_zero_or_many_as(VPre, ShapeClass::VPre)?;
186 self.accept_zero_or_many(VAbv)?;
187 self.accept_zero_or_many(VBlw)?;
188 self.accept_zero_or_many(VPst)?;
189 while self.parse_vowel_modifier()? {}
190 self.accept_zero_or_many(FAbv)?;
191 self.accept_zero_or_many(FBlw)?;
192 self.accept_zero_or_many(FPst)?;
193 self.accept(FM)?;
194 Some(())
195 }
196
197 fn parse_vowel_modifier(&mut self) -> Option<bool> {
198 use UseClass::*;
199 Some(match self.kind() {
200 VMPre => {
201 self.accept_any_as(ShapeClass::VMPre)?;
202 true
203 }
204 VMAbv | VMBlw | VMPst => {
205 self.accept_any()?;
206 true
207 }
208 H => {
213 self.accept_any()?;
214 true
215 }
216 _ => false,
217 })
218 }
219
220 fn parse_halant_base(&mut self) -> Option<bool> {
221 use UseClass::*;
222 self.vt = false;
223 match self.kind() {
224 SUB => {
225 self.accept_any()?;
226 self.accept_zero_or_many(CMAbv)?;
227 self.accept_zero_or_many(CMBlw)?;
228 return Some(true);
229 }
230 H => {
231 self.vt = true;
232 if self.s.chars.script == Script::Khmer && self.s.cur.ch == '\u{17d2}' {
233 self.accept_any_as(ShapeClass::Other)?;
234 } else {
235 self.accept_any_as(ShapeClass::Halant)?;
236 }
237 match self.kind() {
238 B => {
239 self.vt = false;
240 self.accept_any_as(ShapeClass::Base)?;
241 self.accept_as(VS, ShapeClass::Vs)?;
242 self.accept_zero_or_many(CMAbv)?;
243 self.accept_zero_or_many(CMBlw)?;
244 return Some(true);
245 }
246 _ => {
247 return Some(false);
248 }
249 }
250 }
251 _ => {}
252 }
253 Some(false)
254 }
255
256 fn parse_halant_number(&mut self) -> Option<bool> {
257 use UseClass::*;
258 match self.kind() {
259 HN => {
260 self.accept_any_as(ShapeClass::Halant)?;
261 match self.kind() {
262 N => {
263 self.accept_any_as(ShapeClass::Base)?;
264 self.accept_as(VS, ShapeClass::Vs)?;
265 Some(true)
266 }
267 _ => Some(false),
268 }
269 }
270 _ => None,
271 }
272 }
273
274 fn parse_emoji_extension(&mut self) -> Option<bool> {
275 use ClusterBreak::*;
276 loop {
277 match self.s.cur.info.cluster_break() {
278 EX => match self.s.cur.ch as u32 {
279 0x200C => self.accept_any_as(ShapeClass::Zwnj)?,
280 0xFE0F => {
281 self.cluster.info_mut().set_emoji(Emoji::Color);
282 self.cluster.note_char(&self.s.cur);
283 self.advance()?;
284 }
285 0xFE0E => {
286 self.cluster.info_mut().set_emoji(Emoji::Text);
287 self.cluster.note_char(&self.s.cur);
288 self.advance()?;
289 }
290 _ => self.accept_any_as(ShapeClass::Mark)?,
291 },
292 ZWJ => {
293 self.accept_any_as(ShapeClass::Zwj)?;
294 return Some(true);
295 }
296 _ => break,
297 }
298 }
299 Some(false)
300 }
301
    /// Returns the emoji flag of the current lookahead token.
    #[inline(always)]
    fn emoji(&self) -> bool {
        self.s.cur_emoji
    }
306
    /// Returns the class of the current lookahead token.
    #[inline(always)]
    fn kind(&self) -> Kind {
        self.s.cur_kind
    }
311
    /// Consumes the lookahead token as `ShapeClass::Other` if its class is
    /// `kind`. Returns whether a token was consumed; see `accept_as`.
    fn accept(&mut self, kind: Kind) -> Option<bool> {
        self.accept_as(kind, ShapeClass::Other)
    }
315
316 fn accept_as(&mut self, kind: Kind, as_class: ShapeClass) -> Option<bool> {
317 if self.s.cur_kind == kind {
318 self.accept_any_as(as_class)?;
319 Some(true)
320 } else {
321 Some(false)
322 }
323 }
324
325 fn accept_zero_or_many(&mut self, kind: Kind) -> Option<bool> {
326 let mut some = false;
327 while self.accept(kind)? {
328 some = true;
329 }
330 Some(some)
331 }
332
333 fn accept_zero_or_many_as(&mut self, kind: Kind, as_class: ShapeClass) -> Option<bool> {
334 let mut some = false;
335 while self.accept_as(kind, as_class)? {
336 some = true;
337 }
338 Some(some)
339 }
340
341 fn accept_any(&mut self) -> Option<()> {
342 self.cluster.push(&self.s.cur, ShapeClass::Other);
343 self.advance()?;
344 Some(())
345 }
346
347 fn accept_any_as(&mut self, as_class: ShapeClass) -> Option<()> {
348 self.cluster.push(&self.s.cur, as_class);
349 self.advance()?;
350 Some(())
351 }
352
353 fn advance(&mut self) -> Option<()> {
354 if self.cluster.len() as usize == MAX_CLUSTER_SIZE {
355 return None;
356 }
357 if let Some((input, kind, emoji)) = self.s.chars.next() {
358 self.s.cur = input;
359 self.s.cur_emoji = emoji;
360 self.s.cur_kind = kind;
361 if input.ch == '\u{34f}' {
362 self.accept_any_as(ShapeClass::Other)?;
363 }
364 Some(())
365 } else {
366 self.s.done = true;
367 None
368 }
369 }
370}
371
impl UseClass {
    /// Maps a `UseClass` to the `ShapeClass` recorded for a token of that
    /// class. Only the classes listed below have a dedicated shape class;
    /// every other class maps to `ShapeClass::Other`.
    pub fn to_shape_class(self) -> ShapeClass {
        match self {
            Self::B => ShapeClass::Base,
            Self::H => ShapeClass::Halant,
            Self::VPre => ShapeClass::VPre,
            Self::VMPre => ShapeClass::VMPre,
            Self::VBlw => ShapeClass::VBlw,
            Self::R => ShapeClass::Reph,
            Self::ZWNJ => ShapeClass::Zwnj,
            Self::ZWJ => ShapeClass::Zwj,
            _ => ShapeClass::Other,
        }
    }
}
387
/// Iterator adapter that classifies tokens and expands some of them into
/// several tokens before they reach the parser.
#[derive(Clone)]
struct Tokens<I> {
    /// Underlying token iterator.
    iter: I,
    /// Buffer of pending expanded tokens, each paired with its class.
    decomp: [(Token, UseClass); 3],
    /// Number of valid entries in `decomp`.
    decomp_len: u8,
    /// Index of the next `decomp` entry to hand out.
    decomp_offset: u8,
    /// Script of the text; consulted for the Khmer special cases.
    script: Script,
}
396
397impl<I> Tokens<I> {
398 fn new(script: Script, iter: I) -> Self {
399 Self {
400 iter,
401 decomp: [(Token::default(), UseClass::O); 3],
402 decomp_len: 0,
403 decomp_offset: 0,
404 script,
405 }
406 }
407}
408
impl<I> Iterator for Tokens<I>
where
    I: Iterator<Item = Token> + Clone,
{
    /// A token, its class, and whether it is flagged as emoji.
    type Item = (Token, UseClass, bool);

    /// Yields the next classified token.
    ///
    /// Pending entries of the expansion buffer are drained first and are
    /// always reported with the emoji flag cleared. Otherwise a token is
    /// read from the underlying iterator and either passed through or
    /// expanded into the buffer, in which case the call recurses to hand
    /// out the first buffered entry.
    fn next(&mut self) -> Option<Self::Item> {
        if self.decomp_offset < self.decomp_len {
            // Drain the expansion buffer before reading new input.
            let (input, class) = self.decomp[self.decomp_offset as usize];
            self.decomp_offset += 1;
            Some((input, class, false))
        } else {
            let input = self.iter.next()?;
            let (class, needs_decomp, emoji) = input.info.use_class();
            if needs_decomp {
                // Replace the token by its decomposition. Each piece is a
                // copy of the original token with its own character and
                // properties.
                self.decomp_offset = 0;
                self.decomp_len = 0;
                for c in input.ch.decompose().chars() {
                    // The buffer holds three entries; anything beyond that is dropped.
                    if self.decomp_len == 3 {
                        break;
                    }
                    let props = c.properties();
                    let (class, ..) = props.use_class();
                    let c2 = Token {
                        ch: *c,
                        info: input.info.with_properties(props),
                        ..input
                    };
                    self.decomp[self.decomp_len as usize] = (c2, class);
                    self.decomp_len += 1;
                }
                // NOTE(review): if the decomposition yields no characters the
                // buffer stays empty and this call moves on to the next input
                // token, dropping the current one. Confirm that cannot occur.
                return self.next();
            } else if self.script == Script::Khmer {
                match input.ch as u32 {
                    // For these Khmer code points a U+17C1 token is emitted
                    // first, followed by the original token.
                    0x17BE | 0x17BF | 0x17C0 | 0x17C4 | 0x17C5 => {
                        let a = '\u{17C1}';
                        let props = a.properties();
                        let a_class = props.use_class().0;
                        let a = Token {
                            ch: a,
                            info: input.info.with_properties(props),
                            ..input
                        };
                        self.decomp[0] = (a, a_class);
                        self.decomp[1] = (input, class);
                        self.decomp_len = 2;
                        self.decomp_offset = 0;
                        return self.next();
                    }
                    _ => {}
                }
            }
            // Ordinary token: passed through unchanged.
            Some((input, class, emoji))
        }
    }
}