// rustybuzz/hb/ot_shape_normalize.rs

1use super::buffer::*;
2use super::common::hb_codepoint_t;
3use super::hb_font_t;
4use super::ot_layout::*;
5use super::ot_shape_complex::MAX_COMBINING_MARKS;
6use super::ot_shape_plan::hb_ot_shape_plan_t;
7use super::unicode::{hb_unicode_funcs_t, CharExt};
8
/// State shared by the normalization helpers in this file.
pub struct hb_ot_shape_normalize_context_t<'a> {
    /// Shape plan; its `shaper` supplies the normalization preference and the
    /// optional decompose / compose / reorder-marks hooks.
    pub plan: &'a hb_ot_shape_plan_t,
    /// Buffer being normalized.
    pub buffer: &'a mut hb_buffer_t,
    /// Font used for nominal-glyph and variation-glyph lookups.
    pub face: &'a hb_font_t<'a>,
    /// Splits a character into a pair, or returns `None` when it does not
    /// decompose.  `decompose` treats a second element of `'\0'` as
    /// "no second character".
    pub decompose: fn(&hb_ot_shape_normalize_context_t, char) -> Option<(char, char)>,
    /// Combines two characters into one, or returns `None` when they do not compose.
    pub compose: fn(&hb_ot_shape_normalize_context_t, char, char) -> Option<char>,
}
16
/// Normalization preference of a shaper; one of the
/// `HB_OT_SHAPE_NORMALIZATION_MODE_*` constants below.
pub type hb_ot_shape_normalization_mode_t = i32;
/// `_hb_ot_shape_normalize` always short-circuits in this mode and never recomposes.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_NONE: i32 = 0;
/// Decompose without short-circuiting; the recompose round is skipped.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED: i32 = 1;
/// Decompose (short-circuiting where possible), then recompose marks.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS: i32 = 2; /* Never composes base-to-base */
/// Like `COMPOSED_DIACRITICS`, but without short-circuiting the decompose round.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT: i32 = 3; /* Always fully decomposes and then recompose back */
/// Resolved to a concrete mode at the start of `_hb_ot_shape_normalize`.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_AUTO: i32 = 4; /* See hb-ot-shape-normalize.cc for logic. */
/// Default preference; currently an alias for `AUTO`.
#[allow(dead_code)]
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT: i32 = HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;
25
// HIGH-LEVEL DESIGN:
27//
28// This file exports one main function: normalize().
29//
30// This function closely reflects the Unicode Normalization Algorithm,
31// yet it's different.
32//
33// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
34// The logic however tries to use whatever the font can support.
35//
// In general what happens is this: each grapheme is decomposed in a chain
// of 1:2 decompositions, the marks are reordered, and the result is then
// recomposed if desired.  So far this is like Unicode Normalization.  However,
// the decomposition and recomposition only happen if the font supports the
// resulting characters.
40//
41// The goals are:
42//
43//   - Try to render all canonically equivalent strings similarly.  To really
44//     achieve this we have to always do the full decomposition and then
45//     selectively recompose from there.  It's kinda too expensive though, so
46//     we skip some cases.  For example, if composed is desired, we simply
47//     don't touch 1-character clusters that are supported by the font, even
48//     though their NFC may be different.
49//
50//   - When a font has a precomposed character for a sequence but the 'ccmp'
51//     feature in the font is not adequate, use the precomposed character
52//     which typically has better mark positioning.
53//
54//   - When a font does not support a combining mark, but supports it precomposed
55//     with previous base, use that.  This needs the itemizer to have this
56//     knowledge too.  We need to provide assistance to the itemizer.
57//
58//   - When a font does not support a character but supports its canonical
59//     decomposition, well, use the decomposition.
60//
61//   - The complex shapers can customize the compose and decompose functions to
62//     offload some of their requirements to the normalizer.  For example, the
63//     Indic shaper may want to disallow recomposing of two matras.
64
65fn decompose_unicode(
66    _: &hb_ot_shape_normalize_context_t,
67    ab: hb_codepoint_t,
68) -> Option<(hb_codepoint_t, hb_codepoint_t)> {
69    super::unicode::decompose(ab)
70}
71
72fn compose_unicode(
73    _: &hb_ot_shape_normalize_context_t,
74    a: hb_codepoint_t,
75    b: hb_codepoint_t,
76) -> Option<hb_codepoint_t> {
77    super::unicode::compose(a, b)
78}
79
80fn set_glyph(info: &mut hb_glyph_info_t, font: &hb_font_t) {
81    if let Some(glyph_id) = font.get_nominal_glyph(info.glyph_id) {
82        info.set_glyph_index(u32::from(glyph_id.0));
83    }
84}
85
86fn output_char(buffer: &mut hb_buffer_t, unichar: u32, glyph: u32) {
87    // This is very confusing indeed.
88    buffer.cur_mut(0).set_glyph_index(glyph);
89    buffer.output_glyph(unichar);
90    // TODO: should be _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer);
91    let mut flags = buffer.scratch_flags;
92    buffer.prev_mut().init_unicode_props(&mut flags);
93    buffer.scratch_flags = flags;
94}
95
96fn next_char(buffer: &mut hb_buffer_t, glyph: u32) {
97    buffer.cur_mut(0).set_glyph_index(glyph);
98    buffer.next_glyph();
99}
100
101fn skip_char(buffer: &mut hb_buffer_t) {
102    buffer.skip_glyph();
103}
104
105/// Returns 0 if didn't decompose, number of resulting characters otherwise.
106fn decompose(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool, ab: hb_codepoint_t) -> u32 {
107    let (a, b) = match (ctx.decompose)(ctx, ab) {
108        Some(decomposed) => decomposed,
109        _ => return 0,
110    };
111
112    let a_glyph = ctx.face.get_nominal_glyph(u32::from(a));
113    let b_glyph = if b != '\0' {
114        match ctx.face.get_nominal_glyph(u32::from(b)) {
115            Some(glyph_id) => Some(glyph_id),
116            None => return 0,
117        }
118    } else {
119        None
120    };
121
122    if !shortest || a_glyph.is_none() {
123        let ret = decompose(ctx, shortest, a);
124        if ret != 0 {
125            if let Some(b_glyph) = b_glyph {
126                output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
127                return ret + 1;
128            }
129            return ret;
130        }
131    }
132
133    if let Some(a_glyph) = a_glyph {
134        // Output a and b.
135        output_char(ctx.buffer, u32::from(a), u32::from(a_glyph.0));
136        if let Some(b_glyph) = b_glyph {
137            output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
138            return 2;
139        }
140        return 1;
141    }
142
143    0
144}
145
146fn decompose_current_character(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool) {
147    let u = ctx.buffer.cur(0).as_char();
148    let glyph = ctx.face.get_nominal_glyph(u32::from(u));
149
150    // TODO: different to harfbuzz, sync
151    if !shortest || glyph.is_none() {
152        if decompose(ctx, shortest, u) > 0 {
153            skip_char(ctx.buffer);
154            return;
155        }
156    }
157
158    // TODO: different to harfbuzz, sync
159    if let Some(glyph) = glyph {
160        next_char(ctx.buffer, u32::from(glyph.0));
161        return;
162    }
163
164    if _hb_glyph_info_is_unicode_space(ctx.buffer.cur(0)) {
165        let space_type = u.space_fallback();
166        if space_type != hb_unicode_funcs_t::NOT_SPACE {
167            let space_glyph = ctx.face.get_nominal_glyph(0x0020).or(ctx.buffer.invisible);
168
169            if let Some(space_glyph) = space_glyph {
170                _hb_glyph_info_set_unicode_space_fallback_type(ctx.buffer.cur_mut(0), space_type);
171                next_char(ctx.buffer, u32::from(space_glyph.0));
172                ctx.buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK;
173                return;
174            }
175        }
176    }
177
178    // U+2011 is the only sensible character that is a no-break version of another character
179    // and not a space.  The space ones are handled already.  Handle this lone one.
180    if u == '\u{2011}' {
181        if let Some(other_glyph) = ctx.face.get_nominal_glyph(0x2010) {
182            next_char(ctx.buffer, u32::from(other_glyph.0));
183            return;
184        }
185    }
186
187    // Insert a .notdef glyph if decomposition failed.
188    next_char(ctx.buffer, 0);
189}
190
191fn handle_variation_selector_cluster(
192    ctx: &mut hb_ot_shape_normalize_context_t,
193    end: usize,
194    _: bool,
195) {
196    let face = ctx.face;
197
198    // TODO: Currently if there's a variation-selector we give-up, it's just too hard.
199    let buffer = &mut ctx.buffer;
200    while buffer.idx < end - 1 && buffer.successful {
201        if buffer.cur(1).as_char().is_variation_selector() {
202            if let Some(glyph_id) =
203                face.glyph_variation_index(buffer.cur(0).as_char(), buffer.cur(1).as_char())
204            {
205                buffer.cur_mut(0).set_glyph_index(u32::from(glyph_id.0));
206                let unicode = buffer.cur(0).glyph_id;
207                buffer.replace_glyphs(2, 1, &[unicode]);
208            } else {
209                // Just pass on the two characters separately, let GSUB do its magic.
210                set_glyph(buffer.cur_mut(0), face);
211                buffer.next_glyph();
212                set_glyph(buffer.cur_mut(0), face);
213                buffer.next_glyph();
214            }
215
216            // Skip any further variation selectors.
217            while buffer.idx < end && buffer.cur(0).as_char().is_variation_selector() {
218                set_glyph(buffer.cur_mut(0), face);
219                buffer.next_glyph();
220            }
221        } else {
222            set_glyph(buffer.cur_mut(0), face);
223            buffer.next_glyph();
224        }
225    }
226
227    if ctx.buffer.idx < end {
228        set_glyph(ctx.buffer.cur_mut(0), face);
229        ctx.buffer.next_glyph();
230    }
231}
232
233fn decompose_multi_char_cluster(
234    ctx: &mut hb_ot_shape_normalize_context_t,
235    end: usize,
236    short_circuit: bool,
237) {
238    let mut i = ctx.buffer.idx;
239    while i < end && ctx.buffer.successful {
240        if ctx.buffer.info[i].as_char().is_variation_selector() {
241            handle_variation_selector_cluster(ctx, end, short_circuit);
242            return;
243        }
244        i += 1;
245    }
246
247    while ctx.buffer.idx < end && ctx.buffer.successful {
248        decompose_current_character(ctx, short_circuit);
249    }
250}
251
252fn compare_combining_class(pa: &hb_glyph_info_t, pb: &hb_glyph_info_t) -> bool {
253    let a = _hb_glyph_info_get_modified_combining_class(pa);
254    let b = _hb_glyph_info_get_modified_combining_class(pb);
255    a > b
256}
257
258pub fn _hb_ot_shape_normalize(
259    plan: &hb_ot_shape_plan_t,
260    buffer: &mut hb_buffer_t,
261    face: &hb_font_t,
262) {
263    if buffer.is_empty() {
264        return;
265    }
266
267    let mut mode = plan.shaper.normalization_preference;
268    if mode == HB_OT_SHAPE_NORMALIZATION_MODE_AUTO {
269        if plan.has_gpos_mark {
270            // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
271            // mode = Some(HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED);
272            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
273        } else {
274            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
275        }
276    }
277
278    let mut ctx = hb_ot_shape_normalize_context_t {
279        plan,
280        buffer,
281        face,
282        decompose: plan.shaper.decompose.unwrap_or(decompose_unicode),
283        compose: plan.shaper.compose.unwrap_or(compose_unicode),
284    };
285    let mut buffer = &mut ctx.buffer;
286
287    let always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
288    let might_short_circuit = always_short_circuit
289        || (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED
290            && mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT);
291
292    // We do a fairly straightforward yet custom normalization process in three
293    // separate rounds: decompose, reorder, recompose (if desired).  Currently
294    // this makes two buffer swaps.  We can make it faster by moving the last
295    // two rounds into the inner loop for the first round, but it's more readable
296    // this way.
297
298    // First round, decompose
299    let mut all_simple = true;
300    {
301        buffer.clear_output();
302        let count = buffer.len;
303        buffer.idx = 0;
304        loop {
305            let mut end = buffer.idx + 1;
306            while end < count && !_hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
307                end += 1;
308            }
309
310            if end < count {
311                // Leave one base for the marks to cluster with.
312                end -= 1;
313            }
314
315            // From idx to end are simple clusters.
316            if might_short_circuit {
317                let len = end - buffer.idx;
318                let mut done = 0;
319                while done < len {
320                    let cur = buffer.cur_mut(done);
321                    cur.set_glyph_index(match face.get_nominal_glyph(cur.glyph_id) {
322                        Some(glyph_id) => u32::from(glyph_id.0),
323                        None => break,
324                    });
325                    done += 1;
326                }
327                buffer.next_glyphs(done);
328            }
329
330            while buffer.idx < end && buffer.successful {
331                decompose_current_character(&mut ctx, might_short_circuit);
332                buffer = &mut ctx.buffer;
333            }
334
335            if buffer.idx == count || !buffer.successful {
336                break;
337            }
338
339            all_simple = false;
340
341            // Find all the marks now.
342            end = buffer.idx + 1;
343            while end < count && _hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
344                end += 1;
345            }
346
347            // idx to end is one non-simple cluster.
348            decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
349            buffer = &mut ctx.buffer;
350
351            if buffer.idx >= count || !buffer.successful {
352                break;
353            }
354        }
355
356        buffer.sync();
357    }
358
359    // Second round, reorder (inplace)
360    if !all_simple {
361        let count = buffer.len;
362        let mut i = 0;
363        while i < count {
364            if _hb_glyph_info_get_modified_combining_class(&buffer.info[i]) == 0 {
365                i += 1;
366                continue;
367            }
368
369            let mut end = i + 1;
370            while end < count && _hb_glyph_info_get_modified_combining_class(&buffer.info[end]) != 0
371            {
372                end += 1;
373            }
374
375            // We are going to do a O(n^2).  Only do this if the sequence is short.
376            if end - i <= MAX_COMBINING_MARKS {
377                buffer.sort(i, end, compare_combining_class);
378
379                if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
380                    reorder_marks(ctx.plan, buffer, i, end);
381                }
382            }
383
384            i = end + 1;
385        }
386    }
387    if buffer.scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ != 0 {
388        // For all CGJ, check if it prevented any reordering at all.
389        // If it did NOT, then make it skippable.
390        // https://github.com/harfbuzz/harfbuzz/issues/554
391        for i in 1..buffer.len.saturating_sub(1) {
392            if buffer.info[i].glyph_id == 0x034F
393            /* CGJ */
394            {
395                let last = _hb_glyph_info_get_modified_combining_class(&buffer.info[i - 1]);
396                let next = _hb_glyph_info_get_modified_combining_class(&buffer.info[i + 1]);
397                if next == 0 || last <= next {
398                    buffer.info[i].unhide();
399                }
400            }
401        }
402    }
403
404    // Third round, recompose
405    if !all_simple
406        && buffer.successful
407        && (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
408            || mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT)
409    {
410        // As noted in the comment earlier, we don't try to combine
411        // ccc=0 chars with their previous Starter.
412
413        let count = buffer.len;
414        let mut starter = 0;
415        buffer.clear_output();
416        buffer.next_glyph();
417        while buffer.idx < count && buffer.successful {
418            // We don't try to compose a non-mark character with it's preceding starter.
419            // This is both an optimization to avoid trying to compose every two neighboring
420            // glyphs in most scripts AND a desired feature for Hangul.  Apparently Hangul
421            // fonts are not designed to mix-and-match pre-composed syllables and Jamo.
422            let cur = buffer.cur(0);
423            if _hb_glyph_info_is_unicode_mark(cur) &&
424                // If there's anything between the starter and this char, they should have CCC
425                // smaller than this character's.
426                (starter == buffer.out_len - 1
427                    || _hb_glyph_info_get_modified_combining_class(buffer.prev()) < _hb_glyph_info_get_modified_combining_class(cur))
428            {
429                let a = buffer.out_info()[starter].as_char();
430                let b = cur.as_char();
431                if let Some(composed) = (ctx.compose)(&ctx, a, b) {
432                    if let Some(glyph_id) = face.get_nominal_glyph(u32::from(composed)) {
433                        // Copy to out-buffer.
434                        buffer = &mut ctx.buffer;
435                        buffer.next_glyph();
436                        if !buffer.successful {
437                            return;
438                        }
439
440                        // Merge and remove the second composable.
441                        buffer.merge_out_clusters(starter, buffer.out_len);
442                        buffer.out_len -= 1;
443
444                        // Modify starter and carry on.
445                        let mut flags = buffer.scratch_flags;
446                        let info = &mut buffer.out_info_mut()[starter];
447                        info.glyph_id = u32::from(composed);
448                        info.set_glyph_index(u32::from(glyph_id.0));
449                        info.init_unicode_props(&mut flags);
450                        buffer.scratch_flags = flags;
451
452                        continue;
453                    }
454                }
455            }
456
457            // Blocked, or doesn't compose.
458            buffer = &mut ctx.buffer;
459            buffer.next_glyph();
460
461            if _hb_glyph_info_get_modified_combining_class(buffer.prev()) == 0 {
462                starter = buffer.out_len - 1;
463            }
464        }
465
466        buffer.sync();
467    }
468}