skrifa/
charmap.rs

1//! Mapping of characters (codepoints, not graphemes) to nominal glyph identifiers.
2//!
3//! If you have never run into character to glyph mapping before
4//! [Glyph IDs and the 'cmap' table](https://rsheeter.github.io/font101/#glyph-ids-and-the-cmap-table)
5//! might be informative.
6//!
7//! The functionality in this module provides a 1-to-1 mapping from Unicode
8//! characters (or [Unicode variation sequences](http://unicode.org/faq/vs.html)) to
9//! nominal or "default" internal glyph identifiers for a given font.
10//! This is a necessary first step, but generally insufficient for proper layout of
11//! [complex text](https://en.wikipedia.org/wiki/Complex_text_layout) or even
12//! simple text containing diacritics and ligatures.
13//!
14//! Comprehensive mapping of characters to positioned glyphs requires a process called
15//! shaping. For more detail, see: [Why do I need a shaping engine?](https://harfbuzz.github.io/why-do-i-need-a-shaping-engine.html)
16
17use read_fonts::{
18    tables::cmap::{
19        self, Cmap, Cmap12, Cmap12Iter, Cmap14, Cmap14Iter, Cmap4, Cmap4Iter, CmapIterLimits,
20        CmapSubtable, EncodingRecord, PlatformId,
21    },
22    types::GlyphId,
23    FontData, FontRef, TableProvider,
24};
25
26pub use read_fonts::tables::cmap::MapVariant;
27
28/// Mapping of characters to nominal glyph identifiers.
29///
30/// The mappings are derived from the [cmap](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap)
31/// table. Depending on the font, the returned mapping may have entries that point to virtual/phantom glyph
32/// ids beyond the `num_glyphs` entry of the `maxp` table, which are only used during the shaping process,
33/// for example.
34///
35/// ## Obtaining a Charmap
36///
37/// Typically a Charmap is acquired by calling [charmap](crate::MetadataProvider::charmap) on a [FontRef].
38///
39/// ## Selection strategy
40///
41/// Fonts may contain multiple subtables in various formats supporting different encodings. The selection
42/// strategy implemented here is designed to choose mappings that capture the broadest available Unicode
43/// coverage:
44///
45/// * Unicode characters: a symbol mapping subtable is selected if available. Otherwise, subtables supporting
46///   the Unicode full repertoire or Basic Multilingual Plane (BMP) are preferred, in that order. Formats
47///   [4](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values)
48///   and [12](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage) are
49///   supported.
50///
51/// * Unicode variation sequences: these are provided by a format
52///   [14](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-14-unicode-variation-sequences)
53///   subtable.
54///
55#[derive(Clone, Default)]
56pub struct Charmap<'a> {
57    codepoint_subtable: Option<CodepointSubtable<'a>>,
58    variant_subtable: Option<Cmap14<'a>>,
59    cmap12_limits: CmapIterLimits,
60}
61
62impl<'a> Charmap<'a> {
63    /// Creates a new character map from the given font.
64    pub fn new(font: &FontRef<'a>) -> Self {
65        let Ok(cmap) = font.cmap() else {
66            return Default::default();
67        };
68        let selection = MappingSelection::new(font, &cmap);
69        Self {
70            codepoint_subtable: selection
71                .codepoint_subtable
72                .map(|subtable| CodepointSubtable {
73                    subtable,
74                    is_symbol: selection.mapping_index.codepoint_subtable_is_symbol,
75                }),
76            variant_subtable: selection.variant_subtable,
77            cmap12_limits: selection.mapping_index.cmap12_limits,
78        }
79    }
80
81    /// Returns true if a suitable Unicode character mapping is available.
82    pub fn has_map(&self) -> bool {
83        self.codepoint_subtable.is_some()
84    }
85
86    /// Returns true if a symbol mapping was selected.
87    pub fn is_symbol(&self) -> bool {
88        self.codepoint_subtable
89            .as_ref()
90            .map(|x| x.is_symbol)
91            .unwrap_or(false)
92    }
93
94    /// Returns true if a Unicode variation sequence mapping is available.
95    pub fn has_variant_map(&self) -> bool {
96        self.variant_subtable.is_some()
97    }
98
99    /// Maps a character to a nominal glyph identifier.
100    ///
101    /// Returns `None` if a mapping does not exist.
102    pub fn map(&self, ch: impl Into<u32>) -> Option<GlyphId> {
103        self.codepoint_subtable.as_ref()?.map(ch.into())
104    }
105
106    /// Returns an iterator over all mappings of codepoint to nominal glyph
107    /// identifiers in the character map.
108    pub fn mappings(&self) -> Mappings<'a> {
109        self.codepoint_subtable
110            .as_ref()
111            .map(|subtable| {
112                Mappings(match &subtable.subtable {
113                    SupportedSubtable::Format4(cmap4) => MappingsInner::Format4(cmap4.iter()),
114                    SupportedSubtable::Format12(cmap12) => {
115                        MappingsInner::Format12(cmap12.iter_with_limits(self.cmap12_limits))
116                    }
117                })
118            })
119            .unwrap_or(Mappings(MappingsInner::None))
120    }
121
122    /// Maps a character and variation selector to a nominal glyph identifier.
123    ///
124    /// Returns `None` if a mapping does not exist.
125    pub fn map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant> {
126        self.variant_subtable.as_ref()?.map_variant(ch, selector)
127    }
128
129    /// Returns an iterator over all mappings of character and variation
130    /// selector to nominal glyph identifier in the character map.
131    pub fn variant_mappings(&self) -> VariantMappings<'a> {
132        VariantMappings(self.variant_subtable.clone().map(|cmap14| cmap14.iter()))
133    }
134}
135
136/// Cacheable indices of selected mapping tables for materializing a character
137/// map.
138///
139/// Since [`Charmap`] carries a lifetime, it is difficult to store in a cache.
140/// This type serves as an acceleration structure that allows for construction
141/// of a character map while skipping the search for the most suitable Unicode
142/// mappings.
143#[derive(Copy, Clone, Default, Debug)]
144pub struct MappingIndex {
145    /// Index of Unicode or symbol mapping subtable.
146    codepoint_subtable: Option<u16>,
147    /// True if the above is a symbol mapping.
148    codepoint_subtable_is_symbol: bool,
149    /// Index of Unicode variation selector subtable.
150    variant_subtable: Option<u16>,
151    /// Limits for iterating a cmap format 12 subtable.
152    cmap12_limits: CmapIterLimits,
153}
154
155impl MappingIndex {
156    /// Finds the indices of the most suitable Unicode mapping tables in the
157    /// given font.
158    pub fn new(font: &FontRef) -> Self {
159        let Ok(cmap) = font.cmap() else {
160            return Default::default();
161        };
162        MappingSelection::new(font, &cmap).mapping_index
163    }
164
165    /// Creates a new character map for the given font using the tables referenced by
166    /// the precomputed indices.
167    ///
168    /// The font should be the same as the one used to construct this object.
169    pub fn charmap<'a>(&self, font: &FontRef<'a>) -> Charmap<'a> {
170        let Ok(cmap) = font.cmap() else {
171            return Default::default();
172        };
173        let records = cmap.encoding_records();
174        let data = cmap.offset_data();
175        Charmap {
176            codepoint_subtable: self
177                .codepoint_subtable
178                .and_then(|index| get_subtable(data, records, index))
179                .and_then(SupportedSubtable::new)
180                .map(|subtable| CodepointSubtable {
181                    subtable,
182                    is_symbol: self.codepoint_subtable_is_symbol,
183                }),
184            variant_subtable: self
185                .variant_subtable
186                .and_then(|index| get_subtable(data, records, index))
187                .and_then(|subtable| match subtable {
188                    CmapSubtable::Format14(cmap14) => Some(cmap14),
189                    _ => None,
190                }),
191            cmap12_limits: CmapIterLimits::default_for_font(font),
192        }
193    }
194}
195
196/// Iterator over all mappings of character to nominal glyph identifier
197/// in a character map.
198///
199/// This is created with the [`Charmap::mappings`] method.
200#[derive(Clone)]
201pub struct Mappings<'a>(MappingsInner<'a>);
202
203impl Iterator for Mappings<'_> {
204    type Item = (u32, GlyphId);
205
206    fn next(&mut self) -> Option<Self::Item> {
207        loop {
208            let item = match &mut self.0 {
209                MappingsInner::None => None,
210                MappingsInner::Format4(iter) => iter.next(),
211                MappingsInner::Format12(iter) => iter.next(),
212            }?;
213            if item.1 != GlyphId::NOTDEF {
214                return Some(item);
215            }
216        }
217    }
218}
219
220#[derive(Clone)]
221enum MappingsInner<'a> {
222    None,
223    Format4(Cmap4Iter<'a>),
224    Format12(Cmap12Iter<'a>),
225}
226
227/// Iterator over all mappings of character and variation selector to
228/// nominal glyph identifier in a character map.
229///
230/// This is created with the [`Charmap::variant_mappings`] method.
231#[derive(Clone)]
232pub struct VariantMappings<'a>(Option<Cmap14Iter<'a>>);
233
234impl Iterator for VariantMappings<'_> {
235    type Item = (u32, u32, MapVariant);
236
237    fn next(&mut self) -> Option<Self::Item> {
238        self.0.as_mut()?.next()
239    }
240}
241
242fn get_subtable<'a>(
243    data: FontData<'a>,
244    records: &[EncodingRecord],
245    index: u16,
246) -> Option<CmapSubtable<'a>> {
247    records
248        .get(index as usize)
249        .and_then(|record| record.subtable(data).ok())
250}
251
252#[derive(Clone)]
253struct CodepointSubtable<'a> {
254    subtable: SupportedSubtable<'a>,
255    /// True if the subtable is a symbol mapping.
256    is_symbol: bool,
257}
258
259impl CodepointSubtable<'_> {
260    fn map(&self, codepoint: u32) -> Option<GlyphId> {
261        self.map_impl(codepoint).or_else(|| {
262            if self.is_symbol && codepoint <= 0x00FF {
263                // From HarfBuzz:
264                // For symbol-encoded OpenType fonts, we duplicate the
265                // U+F000..F0FF range at U+0000..U+00FF.  That's what
266                // Windows seems to do, and that's hinted about at:
267                // https://docs.microsoft.com/en-us/typography/opentype/spec/recom
268                // under "Non-Standard (Symbol) Fonts".
269                // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1595>
270                self.map_impl(codepoint + 0xF000)
271            } else {
272                None
273            }
274        })
275    }
276
277    fn map_impl(&self, codepoint: u32) -> Option<GlyphId> {
278        let gid = match &self.subtable {
279            SupportedSubtable::Format4(subtable) => subtable.map_codepoint(codepoint),
280            SupportedSubtable::Format12(subtable) => subtable.map_codepoint(codepoint),
281        }?;
282        (gid != GlyphId::NOTDEF).then_some(gid)
283    }
284}
285
286#[derive(Clone)]
287enum SupportedSubtable<'a> {
288    Format4(Cmap4<'a>),
289    Format12(Cmap12<'a>),
290}
291
292impl<'a> SupportedSubtable<'a> {
293    fn new(subtable: CmapSubtable<'a>) -> Option<Self> {
294        Some(match subtable {
295            CmapSubtable::Format4(cmap4) => Self::Format4(cmap4),
296            CmapSubtable::Format12(cmap12) => Self::Format12(cmap12),
297            _ => return None,
298        })
299    }
300
301    fn from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self> {
302        Self::new(record.subtable(cmap.offset_data()).ok()?)
303    }
304}
305
/// Classification of what a cmap subtable maps.
///
/// Variants are declared from lowest to highest selection priority. The
/// derived `PartialOrd` compares in declaration order, so the order below is
/// significant: a greater kind is a better candidate.
#[derive(Copy, Clone, PartialEq, PartialOrd)]
enum MappingKind {
    /// Nothing selected yet; every real kind outranks this.
    None,
    /// Unicode, Basic Multilingual Plane only.
    UnicodeBmp,
    /// Unicode full repertoire.
    UnicodeFull,
    /// Symbol mapping; preferred over all Unicode kinds.
    Symbol,
}
317
318/// The result of searching the cmap table for the "best" available
319/// subtables.
320///
321/// For `codepoint_subtable`, best means either symbol (which is preferred)
322/// or a Unicode subtable with the greatest coverage.
323///
324/// For `variant_subtable`, best means a format 14 subtable.
325struct MappingSelection<'a> {
326    /// The mapping index accelerator that holds indices of the following
327    /// subtables.
328    mapping_index: MappingIndex,
329    /// Either a symbol subtable or the Unicode subtable with the
330    /// greatest coverage.
331    codepoint_subtable: Option<SupportedSubtable<'a>>,
332    /// Subtable that supports mapping Unicode variation sequences.
333    variant_subtable: Option<Cmap14<'a>>,
334}
335
336impl<'a> MappingSelection<'a> {
337    fn new(font: &FontRef<'a>, cmap: &Cmap<'a>) -> Self {
338        const ENCODING_MS_SYMBOL: u16 = 0;
339        const ENCODING_MS_UNICODE_CS: u16 = 1;
340        const ENCODING_APPLE_ID_UNICODE_32: u16 = 4;
341        const ENCODING_APPLE_ID_VARIANT_SELECTOR: u16 = 5;
342        const ENCODING_MS_ID_UCS_4: u16 = 10;
343        let mut mapping_index = MappingIndex::default();
344        let mut mapping_kind = MappingKind::None;
345        let mut codepoint_subtable = None;
346        let mut variant_subtable = None;
347        let mut maybe_choose_subtable = |kind, index, subtable| {
348            if kind > mapping_kind {
349                mapping_kind = kind;
350                mapping_index.codepoint_subtable_is_symbol = kind == MappingKind::Symbol;
351                mapping_index.codepoint_subtable = Some(index as u16);
352                codepoint_subtable = Some(subtable);
353            }
354        };
355        // This generally follows the same strategy as FreeType, searching the encoding
356        // records in reverse and prioritizing UCS-4 subtables over UCS-2.
357        // See <https://gitlab.freedesktop.org/freetype/freetype/-/blob/ac5babe87629107c43f627e2cd17c6cf4f2ecd43/src/base/ftobjs.c#L1370>
358        // The exception is that we prefer a symbol subtable over all others which matches the behavior
359        // of HarfBuzz.
360        // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1818>
361        for (i, record) in cmap.encoding_records().iter().enumerate().rev() {
362            match (record.platform_id(), record.encoding_id()) {
363                (PlatformId::Unicode, ENCODING_APPLE_ID_VARIANT_SELECTOR) => {
364                    // Unicode variation sequences
365                    if let Ok(CmapSubtable::Format14(subtable)) =
366                        record.subtable(cmap.offset_data())
367                    {
368                        if variant_subtable.is_none() {
369                            mapping_index.variant_subtable = Some(i as u16);
370                            variant_subtable = Some(subtable);
371                        }
372                    }
373                }
374                (PlatformId::Windows, ENCODING_MS_SYMBOL) => {
375                    // Symbol
376                    if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
377                        maybe_choose_subtable(MappingKind::Symbol, i, subtable);
378                    }
379                }
380                (PlatformId::Windows, ENCODING_MS_ID_UCS_4)
381                | (PlatformId::Unicode, ENCODING_APPLE_ID_UNICODE_32) => {
382                    // Unicode full repertoire
383                    if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
384                        maybe_choose_subtable(MappingKind::UnicodeFull, i, subtable);
385                    }
386                }
387                (PlatformId::ISO, _)
388                | (PlatformId::Unicode, _)
389                | (PlatformId::Windows, ENCODING_MS_UNICODE_CS) => {
390                    // Unicode BMP only
391                    if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
392                        maybe_choose_subtable(MappingKind::UnicodeBmp, i, subtable);
393                    }
394                }
395                _ => {}
396            }
397        }
398        mapping_index.cmap12_limits = CmapIterLimits::default_for_font(font);
399        Self {
400            mapping_index,
401            codepoint_subtable,
402            variant_subtable,
403        }
404    }
405}
406
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetadataProvider;
    use read_fonts::FontRef;

    /// A full-repertoire format 12 subtable wins over format 4.
    #[test]
    fn choose_format_12_over_4() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let charmap = font.charmap();
        assert!(matches!(
            charmap.codepoint_subtable,
            Some(CodepointSubtable {
                subtable: SupportedSubtable::Format12(..),
                ..
            })
        ));
    }

    /// Format 4 is selected for this font.
    #[test]
    fn choose_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let charmap = font.charmap();
        assert!(matches!(
            charmap.codepoint_subtable,
            Some(CodepointSubtable {
                subtable: SupportedSubtable::Format4(..),
                ..
            })
        ));
    }

    /// A symbol subtable is selected and flagged as such.
    #[test]
    fn choose_symbol() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.is_symbol());
        assert!(matches!(
            charmap.codepoint_subtable,
            Some(CodepointSubtable {
                subtable: SupportedSubtable::Format4(..),
                ..
            })
        ));
    }

    #[test]
    fn map_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let charmap = font.charmap();
        let expectations = [
            ('A', Some(GlyphId::new(1))),
            ('À', Some(GlyphId::new(2))),
            ('`', Some(GlyphId::new(3))),
            ('B', None),
        ];
        for (ch, expected) in expectations {
            assert_eq!(charmap.map(ch), expected);
        }
    }

    #[test]
    fn map_format_12() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let charmap = font.charmap();
        assert_eq!(charmap.map(' '), None);
        let expectations = [
            (0x101723_u32, GlyphId::new(1)),
            (0x101725_u32, GlyphId::new(3)),
            (0x102523_u32, GlyphId::new(6)),
            (0x102526_u32, GlyphId::new(9)),
            (0x102527_u32, GlyphId::new(10)),
        ];
        for (codepoint, expected) in expectations {
            assert_eq!(charmap.map(codepoint), Some(expected));
        }
    }

    #[test]
    fn map_symbol_pua() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.codepoint_subtable.as_ref().unwrap().is_symbol);
        let expectations = [
            (0xF001_u32, GlyphId::new(1)),
            (0xF002_u32, GlyphId::new(2)),
            (0xF003_u32, GlyphId::new(3)),
            (0xF0FE_u32, GlyphId::new(4)),
            // The following don't exist in the cmap table and are remapped into the U+F000..F0FF range
            // due to the selection of a symbol mapping subtable.
            (0x1_u32, GlyphId::new(1)),
            (0x2_u32, GlyphId::new(2)),
            (0x3_u32, GlyphId::new(3)),
            (0xFE_u32, GlyphId::new(4)),
        ];
        for (codepoint, expected) in expectations {
            assert_eq!(charmap.map(codepoint), Some(expected));
        }
    }

    #[test]
    fn map_variants() {
        use super::MapVariant::*;
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        let selector = '\u{e0100}';
        let expectations = [
            ('a', None),
            ('\u{4e00}', Some(UseDefault)),
            ('\u{4e06}', Some(UseDefault)),
            ('\u{4e08}', Some(Variant(GlyphId::new(25)))),
            ('\u{4e09}', Some(Variant(GlyphId::new(26)))),
        ];
        for (ch, expected) in expectations {
            assert_eq!(charmap.map_variant(ch, selector), expected);
        }
    }

    /// Every iterated mapping is non-notdef and agrees with `map`.
    #[test]
    fn mappings() {
        let fonts = [
            font_test_data::VAZIRMATN_VAR,
            font_test_data::CMAP12_FONT1,
            font_test_data::SIMPLE_GLYF,
            font_test_data::CMAP4_SYMBOL_PUA,
        ];
        for font_data in fonts {
            let font = FontRef::new(font_data).unwrap();
            let charmap = font.charmap();
            for (codepoint, glyph_id) in charmap.mappings() {
                assert_ne!(
                    glyph_id,
                    GlyphId::NOTDEF,
                    "we should never encounter notdef glyphs"
                );
                assert_eq!(charmap.map(codepoint), Some(glyph_id));
            }
        }
    }

    /// Every iterated variant mapping agrees with `map_variant`.
    #[test]
    fn variant_mappings() {
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        for (codepoint, selector, variant) in charmap.variant_mappings() {
            let looked_up = charmap.map_variant(codepoint, selector);
            assert_eq!(looked_up, Some(variant));
        }
    }
}