// swash/text/analyze.rs

use super::{cluster::Boundary, Codepoint, LineBreak, Properties, WordBreak};
use core::borrow::Borrow;
3
4/// Returns an iterator yielding unicode properties and boundary analysis for
5/// each character in the specified sequence.
6pub fn analyze<I>(chars: I) -> Analyze<I::IntoIter>
7where
8    I: IntoIterator,
9    I::IntoIter: Clone,
10    I::Item: Borrow<char>,
11{
12    Analyze {
13        chars: chars.into_iter(),
14        state: BoundaryState::new(),
15    }
16}
17
18/// Iterator that yields Unicode properties and boundary analysis.
19/// This iterator is created by the [`analyze`] function.
20#[derive(Clone)]
21pub struct Analyze<I> {
22    chars: I,
23    state: BoundaryState,
24}
25
26impl<I> Iterator for Analyze<I>
27where
28    I: Iterator + Clone,
29    I::Item: Borrow<char>,
30{
31    type Item = (Properties, Boundary);
32
33    fn next(&mut self) -> Option<Self::Item> {
34        self.state.next(&mut self.chars)
35    }
36}
37
38impl<I> Analyze<I> {
39    /// Returns true if the analysis indicates that bidi resolution is
40    /// required.
41    pub fn needs_bidi_resolution(&self) -> bool {
42        self.state.needs_bidi
43    }
44
45    /// Sets the word breaking strength that will be used to analyze the next character.
46    pub fn set_break_strength(&mut self, strength: WordBreakStrength) {
47        self.state.strength = strength;
48    }
49}
50
/// Word breaking strength (corresponds to <https://drafts.csswg.org/css-text/#word-break-property>).
///
/// The variants mirror the CSS `word-break` values `normal`, `break-all` and
/// `keep-all`. The default is [`WordBreakStrength::Normal`].
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Default)]
#[repr(u8)]
pub enum WordBreakStrength {
    /// Words can be broken according to their normal Unicode rules.
    #[default]
    Normal,
    /// Breaking treats numeric, alphabetic, and Southeast Asian classes as Ideographic. Note that this does not affect
    /// breaking punctuation.
    BreakAll,
    /// Breaking between typographic letter units or the NU, AL, AI, or ID classes is prohibited.
    KeepAll,
}
64
65#[derive(Clone)]
66struct BoundaryState {
67    strength: WordBreakStrength,
68    prev: WordBreak,
69    prevent_next: bool,
70    ri_count: u8,
71    emoji: bool,
72    next_emoji: bool,
73    line_state: (u8, Option<LineBreak>),
74    first: bool,
75    needs_bidi: bool,
76}
77
78impl BoundaryState {
79    fn new() -> Self {
80        Self {
81            strength: WordBreakStrength::default(),
82            prev: WordBreak::EX,
83            prevent_next: false,
84            ri_count: 0,
85            emoji: false,
86            next_emoji: false,
87            line_state: (sot, None),
88            first: true,
89            needs_bidi: false,
90        }
91    }
92
93    fn reset_state(&mut self) {
94        self.ri_count = 0;
95        self.emoji = false;
96    }
97
98    fn check_word<I>(&mut self, props: Properties, iter: &mut I) -> bool
99    where
100        I: Iterator + Clone,
101        I::Item: Borrow<char>,
102    {
103        use WordBreak::*;
104        let b = props.word_break();
105        let emoji = props.is_extended_pictographic();
106        if self.first {
107            self.first = false;
108            self.prev = b;
109            self.next_emoji = emoji;
110            if b == RI {
111                self.ri_count = 1;
112            }
113            return true;
114        }
115        let prev_emoji = self.emoji;
116        self.emoji = self.emoji || self.next_emoji;
117        self.next_emoji = emoji;
118        let a = self.prev;
119        self.prev = b;
120        if self.prevent_next {
121            self.prevent_next = false;
122            return false;
123        }
124        if a == CR && b == LF {
125            self.reset_state();
126            return false;
127        }
128        let a_mask = a.mask();
129        let b_mask = b.mask();
130        const AH_LETTER: u32 = LE.mask() | HL.mask();
131        const MID_NUM_LET_Q: u32 = MB.mask() | SQ.mask();
132        const WB3_A: u32 = NL.mask() | CR.mask() | LF.mask();
133        if a_mask & WB3_A != 0 || b_mask & WB3_A != 0 {
134            // (Newline | CR | LF) ÷
135            // ÷ (Newline | CR | LF)
136            self.reset_state();
137            return true;
138        }
139        if a == ZWJ && emoji {
140            self.reset_state();
141            return false;
142        }
143        const WB_4: u32 = Extend.mask() | FO.mask() | ZWJ.mask();
144        if b_mask & WB_4 != 0 {
145            // Ignore format and extend characters
146            self.reset_state();
147            self.prev = a;
148            return false;
149        }
150        if a == WSegSpace && b == WSegSpace {
151            // WSegSpace × WSegSpace
152            self.reset_state();
153            return false;
154        }
155        if a_mask & AH_LETTER != 0 {
156            // AHLetter × AHLetter
157            // AHLetter × Numeric
158            if b_mask & (AH_LETTER | NU.mask()) != 0 {
159                self.reset_state();
160                return false;
161            }
162            if b_mask & (ML.mask() | MID_NUM_LET_Q) != 0 {
163                // AHLetter	× (MidLetter | MidNumLetQ) AHLetter
164                // AHLetter (MidLetter | MidNumLetQ) × AHLetter
165                if let Some(c) = iter
166                    .clone()
167                    .next()
168                    .map(|p| p.borrow().properties().word_break())
169                {
170                    if c.mask() & AH_LETTER != 0 {
171                        self.prevent_next = true;
172                        self.reset_state();
173                        return false;
174                    }
175                }
176            }
177        }
178        if a == HL {
179            if b == SQ {
180                self.reset_state();
181                return false;
182            }
183            if b == DQ {
184                // Hebrew_Letter × Double_Quote Hebrew_Letter
185                // Hebrew_Letter Double_Quote × Hebrew_Letter
186                if let Some(c) = iter
187                    .clone()
188                    .next()
189                    .map(|p| p.borrow().properties().word_break())
190                {
191                    if c == HL {
192                        self.prevent_next = true;
193                        self.reset_state();
194                        return false;
195                    }
196                }
197            }
198        }
199        if a_mask & NU.mask() != 0 {
200            // Numeric × Numeric
201            // Numeric × AHLetter
202            if b_mask & (NU.mask() | AH_LETTER) != 0 {
203                self.reset_state();
204                return false;
205            }
206            if b_mask & (MN.mask() | MID_NUM_LET_Q) != 0 {
207                if let Some(c) = iter
208                    .clone()
209                    .next()
210                    .map(|p| p.borrow().properties().word_break())
211                {
212                    // Numeric (MidNum | MidNumLetQ) × Numeric
213                    // Numeric × (MidNum | MidNumLetQ) Numeric
214                    if c == NU {
215                        self.prevent_next = true;
216                        self.reset_state();
217                        return false;
218                    }
219                }
220            }
221        }
222        if a == KA && b == KA {
223            // Katakana × Katakana
224            self.reset_state();
225            return false;
226        }
227        const WB13_A: u32 = AH_LETTER | NU.mask() | KA.mask() | EX.mask();
228        if a_mask & WB13_A != 0 && b == EX {
229            // (AHLetter | Numeric | Katakana | ExtendNumLet) ×	ExtendNumLet
230            self.reset_state();
231            return false;
232        }
233        const WB13_B: u32 = AH_LETTER | NU.mask() | KA.mask();
234        if a == EX && b_mask & WB13_B != 0 {
235            // ExtendNumLet × (AHLetter | Numeric | Katakana)
236            self.reset_state();
237            return false;
238        }
239        if prev_emoji && a == ZWJ && emoji {
240            self.ri_count = 0;
241            return false;
242        }
243        if self.ri_count == 2 {
244            self.reset_state();
245            if b == RI {
246                self.ri_count = 1;
247            }
248            return true;
249        }
250        if b == RI {
251            self.ri_count += 1;
252            if a != RI {
253                self.reset_state();
254                self.ri_count = 1;
255                return true;
256            }
257            self.emoji = false;
258            return false;
259        }
260        self.reset_state();
261        true
262    }
263
264    fn check_line(&mut self, props: Properties) -> Boundary {
265        let state = self.line_state;
266        let lb = props.line_break();
267
268        use LineBreak::*;
269
270        let val = PAIR_TABLE[state.0 as usize][lb as usize];
271
272        // word-break: break-all
273        //
274        // Treat the NU, AL, and SA line breaking classes as ID.
275        let mode_val = if self.strength == WordBreakStrength::BreakAll {
276            let left = if matches!(state.1, Some(AL | NU | SA)) {
277                ID as usize
278            } else {
279                state.0 as usize
280            };
281            let right = if matches!(lb, AL | NU | SA) {
282                ID as usize
283            } else {
284                lb as usize
285            };
286            PAIR_TABLE[left][right]
287        } else {
288            val
289        };
290
291        let mut mode = if mode_val & MANDATORY_BREAK_BIT != 0 {
292            Boundary::Mandatory
293        } else if mode_val & ALLOWED_BREAK_BIT != 0 && state.1 != Some(ZWJ) {
294            Boundary::Line
295        } else {
296            Boundary::None
297        };
298
299        // word-break: keep-all
300        //
301        // Prohibit breaking between typographic letter units or the NU, AL, or
302        // AI, or ID classes.
303        // (See https://github.com/unicode-org/icu4x/blob/1e27279/components/segmenter/src/line.rs#L836-L840)
304        if let (
305            WordBreakStrength::KeepAll,
306            Some(AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ),
307            AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
308        ) = (self.strength, state.1, lb)
309        {
310            mode = Boundary::None;
311        }
312
313        // Store the original value, not the modified one.
314        self.line_state = (val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), Some(lb));
315        mode
316    }
317
318    fn next<I>(&mut self, iter: &mut I) -> Option<(Properties, Boundary)>
319    where
320        I: Iterator + Clone,
321        I::Item: Borrow<char>,
322    {
323        let props = iter.next()?.borrow().properties();
324        let mut boundary = self.check_line(props);
325        let word = self.check_word(props, iter);
326        if boundary as u16 == 0 && word {
327            boundary = Boundary::Word;
328        }
329        self.needs_bidi = self.needs_bidi || props.bidi_class().needs_resolution();
330        Some((props, boundary))
331    }
332}
333
/// Flag bit of a `PAIR_TABLE` entry: a line break is allowed at this position.
const ALLOWED_BREAK_BIT: u8 = 0x80;
/// Flag bit of a `PAIR_TABLE` entry: a line break is mandatory at this position.
const MANDATORY_BREAK_BIT: u8 = 0x40;

/// `PAIR_TABLE` row index used as the initial line breaking state, before any
/// character has been seen (presumably "start of text").
// The lowercase name is kept because other code in this file refers to it.
#[allow(non_upper_case_globals)]
const sot: u8 = 44;
339
/// Line breaking state table.
///
/// Indexed as `PAIR_TABLE[state][class]`, where `state` is the current row
/// (initially `sot`) and `class` is the line break class of the incoming
/// character as a number. In each entry the `0x80` bit marks an allowed
/// break, the `0x40` bit marks a mandatory break, and the remaining low bits
/// hold the row index to use for the following character.
#[rustfmt::skip]
const PAIR_TABLE: [[u8; 44]; 53] = [
    [1,1,130,3,132,5,134,28,8,1,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,1,235,],
    [1,1,130,3,132,5,134,28,8,1,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,1,235,],
    [129,129,2,3,132,5,134,28,8,2,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,50,38,39,129,41,2,235,],
    [129,129,130,3,132,5,134,28,8,3,10,11,140,141,14,143,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,3,235,],
    [1,1,2,3,4,5,134,28,8,4,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,37,38,39,1,41,4,235,],
    [193,193,194,195,196,197,198,220,200,193,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,193,193,229,230,231,193,233,193,235,],
    [129,129,130,131,132,5,134,156,8,6,10,11,140,141,14,15,144,145,146,147,148,149,22,151,152,153,26,27,156,157,158,159,160,33,162,129,129,37,38,39,129,41,6,235,],
    [129,129,130,3,132,5,134,28,8,28,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,28,235,],
    [129,129,130,3,132,5,134,28,8,8,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,31,32,33,162,129,129,48,38,39,129,41,8,235,],
    [1,1,130,3,132,5,134,28,8,9,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,9,235,],
    [1,1,130,3,132,5,134,28,8,10,10,11,140,141,14,15,144,145,18,19,148,149,22,151,152,153,26,27,28,29,158,31,32,33,162,1,1,49,38,39,1,41,10,235,],
    [193,193,194,195,196,197,198,220,200,193,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,26,219,220,221,222,223,224,225,226,193,193,229,230,231,193,233,193,235,],
    [129,129,130,3,132,5,134,28,8,12,10,11,140,13,14,15,144,145,146,19,148,21,22,151,152,153,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,12,235,],
    [129,129,130,3,132,5,134,28,8,13,10,11,140,141,14,15,144,145,146,19,148,21,22,151,152,153,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,13,235,],
    [129,129,130,3,132,5,134,28,8,14,10,11,140,141,14,15,144,145,146,19,148,21,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,14,235,],
    [1,1,2,3,4,5,6,28,8,15,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,37,38,39,1,41,15,235,],
    [129,129,130,3,132,5,134,28,8,16,10,11,140,141,14,15,144,145,146,19,148,21,22,151,24,25,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,16,235,],
    [129,129,130,3,132,5,134,28,8,17,10,11,140,141,14,15,144,145,146,19,148,21,22,151,24,153,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,17,235,],
    [1,1,130,51,132,5,134,28,8,18,10,11,140,141,14,15,144,145,18,51,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,18,235,],
    [129,129,130,3,132,5,134,28,8,19,10,11,140,141,14,143,144,145,146,19,148,149,22,151,152,153,26,27,28,29,158,159,160,33,162,129,129,37,38,39,129,41,19,235,],
    [129,129,130,3,132,5,134,28,8,20,10,11,140,141,14,15,144,145,146,19,148,21,22,151,152,153,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,20,235,],
    [129,129,130,3,132,5,134,28,8,21,10,11,140,141,14,15,144,145,146,19,148,21,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,21,235,],
    [1,1,130,3,132,5,134,28,8,22,10,11,140,141,14,15,144,145,18,19,148,149,22,151,152,153,26,27,28,29,158,159,160,33,162,1,1,37,38,39,1,41,22,235,],
    [129,129,130,3,132,5,134,28,8,23,10,11,140,141,14,15,16,17,146,19,148,21,22,23,152,25,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,23,235,],
    [129,129,130,3,132,5,134,28,8,24,10,11,140,141,14,15,144,145,146,19,148,21,22,151,24,153,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,24,235,],
    [129,129,130,3,132,5,134,28,8,25,10,11,140,141,14,15,144,145,146,19,148,21,22,151,24,25,26,27,28,157,158,31,160,33,162,129,129,37,38,39,129,41,25,235,],
    [193,193,194,195,196,197,198,220,200,193,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,193,193,229,230,231,193,233,193,235,],
    [193,193,194,195,196,197,198,220,200,193,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,193,193,229,230,231,193,233,193,235,],
    [129,129,130,3,132,5,134,28,8,28,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,28,235,],
    [1,1,130,3,132,5,134,28,8,29,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,29,235,],
    [1,1,2,3,4,5,6,28,8,30,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,46,38,39,1,41,30,235,],
    [1,1,130,3,132,5,134,28,8,31,10,11,140,141,14,15,144,145,18,19,148,149,22,151,152,153,26,27,28,29,30,159,160,33,162,1,1,37,38,39,1,41,31,235,],
    [1,1,130,3,132,5,134,28,8,32,10,11,12,13,14,15,16,17,18,19,20,149,22,23,24,25,26,27,28,29,30,159,160,33,162,1,1,37,38,39,1,41,32,235,],
    [1,1,2,3,4,5,6,28,8,33,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,47,38,39,1,41,33,235,],
    [129,129,130,3,132,5,134,28,8,34,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,52,129,129,37,38,39,129,41,34,235,],
    [1,1,130,3,132,5,134,28,8,1,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,1,235,],
    [1,1,130,3,132,5,134,28,8,1,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,1,235,],
    [129,129,130,131,132,5,134,156,8,129,10,11,140,141,14,143,144,145,146,147,148,149,22,151,152,153,26,27,156,157,158,159,160,161,162,129,129,37,38,39,129,41,129,235,],
    [129,129,130,3,132,5,134,28,8,38,10,11,140,141,14,15,144,145,18,19,148,149,22,151,152,153,26,27,28,29,158,159,160,33,162,129,129,37,38,39,129,41,38,235,],
    [1,1,2,3,4,5,6,28,8,39,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,37,38,39,1,41,39,235,],
    [1,1,130,3,132,5,134,28,8,1,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,1,235,],
    [129,129,130,131,132,5,134,156,136,129,138,11,140,141,142,143,144,145,146,147,148,149,150,151,152,153,26,27,156,157,158,159,160,161,162,129,129,45,166,167,129,41,129,235,],
    [1,1,130,3,132,5,134,28,8,42,10,11,140,141,14,15,144,145,18,19,148,21,22,151,152,153,26,27,28,29,30,31,32,33,162,1,1,37,38,39,1,41,42,235,],
    [129,129,130,3,132,5,134,28,8,129,10,11,140,141,14,143,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,129,235,],
    [1,1,2,3,4,5,6,28,8,1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,37,38,39,1,41,1,43,],
    [129,129,130,131,132,5,134,156,136,129,138,11,140,141,142,143,144,145,146,147,148,149,150,151,152,153,26,27,156,157,158,159,160,161,162,129,129,45,166,167,129,41,129,235,],
    [1,1,2,3,4,5,6,28,8,1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,46,38,39,1,41,1,235,],
    [129,129,130,131,132,5,134,156,8,129,10,11,140,141,14,143,144,145,146,147,148,149,22,151,152,153,26,27,156,157,30,159,160,161,162,129,129,47,38,39,129,41,129,235,],
    [129,129,130,131,132,5,134,28,8,129,10,11,140,141,14,143,144,145,146,147,148,149,22,151,152,153,26,27,28,157,158,159,160,161,162,129,129,48,38,39,129,41,129,235,],
    [129,129,130,131,132,5,134,28,8,129,10,11,140,141,14,143,144,145,146,147,148,149,22,151,152,153,26,27,28,157,158,159,160,161,162,129,129,49,38,39,129,41,129,235,],
    [129,129,2,131,132,5,134,156,8,129,10,11,140,141,14,143,144,145,146,147,148,149,22,151,152,153,26,27,156,157,158,159,160,161,162,129,129,50,38,39,129,41,129,235,],
    [1,1,2,3,4,5,134,28,8,51,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,1,1,37,38,39,1,41,51,235,],
    [129,129,130,3,132,5,134,28,8,52,10,11,140,141,14,15,144,145,146,19,148,149,22,151,152,153,26,27,28,157,158,159,160,33,162,129,129,37,38,39,129,41,52,235,],
];