rustybuzz/hb/
unicode.rs

1use core::convert::TryFrom;
2
3pub use unicode_ccc::CanonicalCombiningClass;
4// TODO: prefer unic-ucd-normal::CanonicalCombiningClass
5pub use unicode_properties::GeneralCategory as hb_unicode_general_category_t;
6
7use crate::Script;
8
// Space estimates based on:
// https://unicode.org/charts/PDF/U2000.pdf
// https://docs.microsoft.com/en-us/typography/develop/character-design-standards/whitespace

/// Classification codes for space characters, as returned by
/// `CharExt::space_fallback`.
///
/// `SPACE_EM_n` stands for a space that is one n-th of an em wide (for
/// example THREE-PER-EM SPACE maps to `SPACE_EM_3`). For those entries the
/// numeric value is the divisor itself — presumably so that a consumer can
/// divide the em size by the code directly; confirm against the fallback
/// code before relying on it. The remaining codes (17 and up) are plain tags.
pub mod hb_unicode_funcs_t {
    /// Storage type of a space classification code.
    pub type space_t = u8;

    /// The character has no space fallback.
    pub const NOT_SPACE: space_t = 0;
    /// A full em.
    pub const SPACE_EM: space_t = 1;
    /// 1/2 em.
    pub const SPACE_EM_2: space_t = 2;
    /// 1/3 em.
    pub const SPACE_EM_3: space_t = 3;
    /// 1/4 em.
    pub const SPACE_EM_4: space_t = 4;
    /// 1/5 em.
    pub const SPACE_EM_5: space_t = 5;
    /// 1/6 em.
    pub const SPACE_EM_6: space_t = 6;
    /// 1/16 em.
    pub const SPACE_EM_16: space_t = 16;
    /// 4/18th of an EM!
    pub const SPACE_4_EM_18: space_t = 17;
    /// An ordinary space (U+0020 and U+00A0 map here).
    pub const SPACE: space_t = 18;
    /// FIGURE SPACE (U+2007).
    pub const SPACE_FIGURE: space_t = 19;
    /// PUNCTUATION SPACE (U+2008).
    pub const SPACE_PUNCTUATION: space_t = 20;
    /// NARROW NO-BREAK SPACE (U+202F).
    pub const SPACE_NARROW: space_t = 21;
}
28
/// Replacement values for selected canonical combining classes.
///
/// `CCCn` holds the value that should be used in place of canonical
/// combining class `n`. Classes that keep their canonical value are still
/// listed when they belong to a script group that is documented here.
// NOTE(review): the `dead_code` allowance is kept from the original;
// confirm whether any constant is really unused before removing it.
#[allow(dead_code)]
pub mod modified_combining_class {
    // Hebrew
    //
    // We permute the "fixed-position" classes 10-26 into the order
    // described in the SBL Hebrew manual:
    //
    // https://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf
    //
    // (as recommended by:
    //  https://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering/msg22823/)
    //
    // More details here:
    // https://bugzilla.mozilla.org/show_bug.cgi?id=662055
    pub const CCC10: u8 = 22; // sheva
    pub const CCC11: u8 = 15; // hataf segol
    pub const CCC12: u8 = 16; // hataf patah
    pub const CCC13: u8 = 17; // hataf qamats
    pub const CCC14: u8 = 23; // hiriq
    pub const CCC15: u8 = 18; // tsere
    pub const CCC16: u8 = 19; // segol
    pub const CCC17: u8 = 20; // patah
    pub const CCC18: u8 = 21; // qamats & qamats qatan
    pub const CCC19: u8 = 14; // holam & holam haser for vav
    pub const CCC20: u8 = 24; // qubuts
    pub const CCC21: u8 = 12; // dagesh
    pub const CCC22: u8 = 25; // meteg
    pub const CCC23: u8 = 13; // rafe
    pub const CCC24: u8 = 10; // shin dot
    pub const CCC25: u8 = 11; // sin dot
    pub const CCC26: u8 = 26; // point varika

    // Arabic
    //
    // Modify to move Shadda (ccc=33) before other marks.  See:
    // https://unicode.org/faq/normalization.html#8
    // https://unicode.org/faq/normalization.html#9
    pub const CCC27: u8 = 28; // fathatan
    pub const CCC28: u8 = 29; // dammatan
    pub const CCC29: u8 = 30; // kasratan
    pub const CCC30: u8 = 31; // fatha
    pub const CCC31: u8 = 32; // damma
    pub const CCC32: u8 = 33; // kasra
    pub const CCC33: u8 = 27; // shadda
    pub const CCC34: u8 = 34; // sukun
    pub const CCC35: u8 = 35; // superscript alef

    // Syriac
    pub const CCC36: u8 = 36; // superscript alaph

    // Telugu
    //
    // Modify Telugu length marks (ccc=84, ccc=91).
    // These are the only matras in the main Indic scripts range that have
    // a non-zero ccc.  That makes them reorder with the Halant that is
    // ccc=9.  Just zero them, we don't need them in our Indic shaper.
    pub const CCC84: u8 = 0; // length mark
    pub const CCC91: u8 = 0; // ai length mark

    // Thai
    //
    // Modify U+0E38 and U+0E39 (ccc=103) to be reordered before U+0E3A (ccc=9).
    // Assign 3, which is unassigned otherwise.
    // Uniscribe does this reordering too.
    pub const CCC103: u8 = 3; // sara u / sara uu
    pub const CCC107: u8 = 107; // mai *

    // Lao
    pub const CCC118: u8 = 118; // sign u / sign uu
    pub const CCC122: u8 = 122; // mai *

    // Tibetan
    //
    // In case of multiple vowel-signs, use u first (but after achung)
    // this allows Dzongkha multi-vowel shortcuts to render correctly
    pub const CCC129: u8 = 129; // sign aa
    pub const CCC130: u8 = 132; // sign i
    pub const CCC132: u8 = 131; // sign u
}
108
109#[rustfmt::skip]
110const MODIFIED_COMBINING_CLASS: &[u8; 256] = &[
111    CanonicalCombiningClass::NotReordered as u8,
112    CanonicalCombiningClass::Overlay as u8,
113    2, 3, 4, 5, 6,
114    CanonicalCombiningClass::Nukta as u8,
115    CanonicalCombiningClass::KanaVoicing as u8,
116    CanonicalCombiningClass::Virama as u8,
117
118    // Hebrew
119    modified_combining_class::CCC10,
120    modified_combining_class::CCC11,
121    modified_combining_class::CCC12,
122    modified_combining_class::CCC13,
123    modified_combining_class::CCC14,
124    modified_combining_class::CCC15,
125    modified_combining_class::CCC16,
126    modified_combining_class::CCC17,
127    modified_combining_class::CCC18,
128    modified_combining_class::CCC19,
129    modified_combining_class::CCC20,
130    modified_combining_class::CCC21,
131    modified_combining_class::CCC22,
132    modified_combining_class::CCC23,
133    modified_combining_class::CCC24,
134    modified_combining_class::CCC25,
135    modified_combining_class::CCC26,
136
137    // Arabic
138    modified_combining_class::CCC27,
139    modified_combining_class::CCC28,
140    modified_combining_class::CCC29,
141    modified_combining_class::CCC30,
142    modified_combining_class::CCC31,
143    modified_combining_class::CCC32,
144    modified_combining_class::CCC33,
145    modified_combining_class::CCC34,
146    modified_combining_class::CCC35,
147
148    // Syriac
149    modified_combining_class::CCC36,
150
151    37, 38, 39,
152    40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
153    60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
154    80, 81, 82, 83,
155
156    // Telugu
157    modified_combining_class::CCC84,
158    85, 86, 87, 88, 89, 90,
159    modified_combining_class::CCC91,
160    92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
161
162    // Thai
163    modified_combining_class::CCC103,
164    104, 105, 106,
165    modified_combining_class::CCC107,
166    108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
167
168    // Lao
169    modified_combining_class::CCC118,
170    119, 120, 121,
171    modified_combining_class::CCC122,
172    123, 124, 125, 126, 127, 128,
173
174    // Tibetan
175    modified_combining_class::CCC129,
176    modified_combining_class::CCC130,
177    131,
178    modified_combining_class::CCC132,
179    133, 134, 135, 136, 137, 138, 139,
180
181
182    140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
183    150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
184    160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
185    170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
186    180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
187    190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
188
189    CanonicalCombiningClass::AttachedBelowLeft as u8,
190    201,
191    CanonicalCombiningClass::AttachedBelow as u8,
192    203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
193    CanonicalCombiningClass::AttachedAbove as u8,
194    215,
195    CanonicalCombiningClass::AttachedAboveRight as u8,
196    217,
197    CanonicalCombiningClass::BelowLeft as u8,
198    219,
199    CanonicalCombiningClass::Below as u8,
200    221,
201    CanonicalCombiningClass::BelowRight as u8,
202    223,
203    CanonicalCombiningClass::Left as u8,
204    225,
205    CanonicalCombiningClass::Right as u8,
206    227,
207    CanonicalCombiningClass::AboveLeft as u8,
208    229,
209    CanonicalCombiningClass::Above as u8,
210    231,
211    CanonicalCombiningClass::AboveRight as u8,
212    CanonicalCombiningClass::DoubleBelow as u8,
213    CanonicalCombiningClass::DoubleAbove as u8,
214    235, 236, 237, 238, 239,
215    CanonicalCombiningClass::IotaSubscript as u8,
216    241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
217    255, // RB_UNICODE_COMBINING_CLASS_INVALID
218];
219
/// Extra operations on a Unicode general category value.
pub trait GeneralCategoryExt {
    /// Converts the category to its numeric `u32` code.
    fn to_rb(&self) -> u32;

    /// Builds a category from its numeric `u32` code (the inverse of
    /// [`to_rb`](GeneralCategoryExt::to_rb)).
    fn from_rb(gc: u32) -> Self;

    /// Returns `true` when the category is one of the mark categories.
    fn is_mark(&self) -> bool;

    /// Returns `true` when the category is one of the letter categories.
    fn is_letter(&self) -> bool;
}
226
227#[rustfmt::skip]
228impl GeneralCategoryExt for hb_unicode_general_category_t {
229    fn to_rb(&self) -> u32 {
230        match *self {
231            hb_unicode_general_category_t::ClosePunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION,
232            hb_unicode_general_category_t::ConnectorPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION,
233            hb_unicode_general_category_t::Control => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONTROL,
234            hb_unicode_general_category_t::CurrencySymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL,
235            hb_unicode_general_category_t::DashPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION,
236            hb_unicode_general_category_t::DecimalNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER,
237            hb_unicode_general_category_t::EnclosingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK,
238            hb_unicode_general_category_t::FinalPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION,
239            hb_unicode_general_category_t::Format => hb_gc::RB_UNICODE_GENERAL_CATEGORY_FORMAT,
240            hb_unicode_general_category_t::InitialPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION,
241            hb_unicode_general_category_t::LetterNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER,
242            hb_unicode_general_category_t::LineSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR,
243            hb_unicode_general_category_t::LowercaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER,
244            hb_unicode_general_category_t::MathSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL,
245            hb_unicode_general_category_t::ModifierLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER,
246            hb_unicode_general_category_t::ModifierSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL,
247            hb_unicode_general_category_t::NonspacingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK,
248            hb_unicode_general_category_t::OpenPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION,
249            hb_unicode_general_category_t::OtherLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER,
250            hb_unicode_general_category_t::OtherNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER,
251            hb_unicode_general_category_t::OtherPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION,
252            hb_unicode_general_category_t::OtherSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL,
253            hb_unicode_general_category_t::ParagraphSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR,
254            hb_unicode_general_category_t::PrivateUse => hb_gc::RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE,
255            hb_unicode_general_category_t::SpaceSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR,
256            hb_unicode_general_category_t::SpacingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK,
257            hb_unicode_general_category_t::Surrogate => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SURROGATE,
258            hb_unicode_general_category_t::TitlecaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER,
259            hb_unicode_general_category_t::Unassigned => hb_gc::RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
260            hb_unicode_general_category_t::UppercaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER,
261        }
262    }
263
264    fn from_rb(gc: u32) -> Self {
265        match gc {
266            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION => hb_unicode_general_category_t::ClosePunctuation,
267            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION => hb_unicode_general_category_t::ConnectorPunctuation,
268            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONTROL => hb_unicode_general_category_t::Control,
269            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL => hb_unicode_general_category_t::CurrencySymbol,
270            hb_gc::RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION => hb_unicode_general_category_t::DashPunctuation,
271            hb_gc::RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER => hb_unicode_general_category_t::DecimalNumber,
272            hb_gc::RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK => hb_unicode_general_category_t::EnclosingMark,
273            hb_gc::RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION => hb_unicode_general_category_t::FinalPunctuation,
274            hb_gc::RB_UNICODE_GENERAL_CATEGORY_FORMAT => hb_unicode_general_category_t::Format,
275            hb_gc::RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION => hb_unicode_general_category_t::InitialPunctuation,
276            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER => hb_unicode_general_category_t::LetterNumber,
277            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR => hb_unicode_general_category_t::LineSeparator,
278            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER => hb_unicode_general_category_t::LowercaseLetter,
279            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL => hb_unicode_general_category_t::MathSymbol,
280            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER => hb_unicode_general_category_t::ModifierLetter,
281            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL => hb_unicode_general_category_t::ModifierSymbol,
282            hb_gc::RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK => hb_unicode_general_category_t::NonspacingMark,
283            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION => hb_unicode_general_category_t::OpenPunctuation,
284            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER => hb_unicode_general_category_t::OtherLetter,
285            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER => hb_unicode_general_category_t::OtherNumber,
286            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION => hb_unicode_general_category_t::OtherPunctuation,
287            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL => hb_unicode_general_category_t::OtherSymbol,
288            hb_gc::RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR => hb_unicode_general_category_t::ParagraphSeparator,
289            hb_gc::RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE => hb_unicode_general_category_t::PrivateUse,
290            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR => hb_unicode_general_category_t::SpaceSeparator,
291            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK => hb_unicode_general_category_t::SpacingMark,
292            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SURROGATE => hb_unicode_general_category_t::Surrogate,
293            hb_gc::RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER => hb_unicode_general_category_t::TitlecaseLetter,
294            hb_gc::RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED => hb_unicode_general_category_t::Unassigned,
295            hb_gc::RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER => hb_unicode_general_category_t::UppercaseLetter,
296            _ => unreachable!(),
297        }
298    }
299
300    fn is_mark(&self) -> bool {
301        match *self {
302            hb_unicode_general_category_t::SpacingMark |
303            hb_unicode_general_category_t::EnclosingMark |
304            hb_unicode_general_category_t::NonspacingMark => true,
305            _ => false,
306        }
307    }
308
309    fn is_letter(&self) -> bool {
310        match *self {
311            hb_unicode_general_category_t::LowercaseLetter |
312            hb_unicode_general_category_t::ModifierLetter |
313            hb_unicode_general_category_t::OtherLetter |
314            hb_unicode_general_category_t::TitlecaseLetter |
315            hb_unicode_general_category_t::UppercaseLetter => true,
316            _ => false,
317        }
318    }
319}
320
321pub trait CharExt {
322    fn script(self) -> Script;
323    fn general_category(self) -> hb_unicode_general_category_t;
324    fn combining_class(self) -> CanonicalCombiningClass;
325    fn space_fallback(self) -> hb_unicode_funcs_t::space_t;
326    fn modified_combining_class(self) -> u8;
327    fn mirrored(self) -> Option<char>;
328    fn is_emoji_extended_pictographic(self) -> bool;
329    fn is_default_ignorable(self) -> bool;
330    fn is_variation_selector(self) -> bool;
331    fn vertical(self) -> Option<char>;
332}
333
334impl CharExt for char {
335    fn script(self) -> Script {
336        use crate::script;
337        use unicode_script as us;
338
339        match unicode_script::UnicodeScript::script(&self) {
340            us::Script::Common => script::COMMON,
341            us::Script::Inherited => script::INHERITED,
342            us::Script::Adlam => script::ADLAM,
343            us::Script::Ahom => script::AHOM,
344            us::Script::Anatolian_Hieroglyphs => script::ANATOLIAN_HIEROGLYPHS,
345            us::Script::Arabic => script::ARABIC,
346            us::Script::Armenian => script::ARMENIAN,
347            us::Script::Avestan => script::AVESTAN,
348            us::Script::Balinese => script::BALINESE,
349            us::Script::Bamum => script::BAMUM,
350            us::Script::Bassa_Vah => script::BASSA_VAH,
351            us::Script::Batak => script::BATAK,
352            us::Script::Bengali => script::BENGALI,
353            us::Script::Bhaiksuki => script::BHAIKSUKI,
354            us::Script::Bopomofo => script::BOPOMOFO,
355            us::Script::Brahmi => script::BRAHMI,
356            us::Script::Braille => script::BRAILLE,
357            us::Script::Buginese => script::BUGINESE,
358            us::Script::Buhid => script::BUHID,
359            us::Script::Canadian_Aboriginal => script::CANADIAN_SYLLABICS,
360            us::Script::Carian => script::CARIAN,
361            us::Script::Caucasian_Albanian => script::CAUCASIAN_ALBANIAN,
362            us::Script::Chakma => script::CHAKMA,
363            us::Script::Cham => script::CHAM,
364            us::Script::Cherokee => script::CHEROKEE,
365            us::Script::Chorasmian => script::CHORASMIAN,
366            us::Script::Coptic => script::COPTIC,
367            us::Script::Cuneiform => script::CUNEIFORM,
368            us::Script::Cypriot => script::CYPRIOT,
369            us::Script::Cyrillic => script::CYRILLIC,
370            us::Script::Deseret => script::DESERET,
371            us::Script::Devanagari => script::DEVANAGARI,
372            us::Script::Dives_Akuru => script::DIVES_AKURU,
373            us::Script::Dogra => script::DOGRA,
374            us::Script::Duployan => script::DUPLOYAN,
375            us::Script::Egyptian_Hieroglyphs => script::EGYPTIAN_HIEROGLYPHS,
376            us::Script::Elbasan => script::ELBASAN,
377            us::Script::Elymaic => script::ELYMAIC,
378            us::Script::Ethiopic => script::ETHIOPIC,
379            us::Script::Georgian => script::GEORGIAN,
380            us::Script::Glagolitic => script::GLAGOLITIC,
381            us::Script::Gothic => script::GOTHIC,
382            us::Script::Grantha => script::GRANTHA,
383            us::Script::Greek => script::GREEK,
384            us::Script::Gujarati => script::GUJARATI,
385            us::Script::Gunjala_Gondi => script::GUNJALA_GONDI,
386            us::Script::Gurmukhi => script::GURMUKHI,
387            us::Script::Han => script::HAN,
388            us::Script::Hangul => script::HANGUL,
389            us::Script::Hanifi_Rohingya => script::HANIFI_ROHINGYA,
390            us::Script::Hanunoo => script::HANUNOO,
391            us::Script::Hatran => script::HATRAN,
392            us::Script::Hebrew => script::HEBREW,
393            us::Script::Hiragana => script::HIRAGANA,
394            us::Script::Imperial_Aramaic => script::IMPERIAL_ARAMAIC,
395            us::Script::Inscriptional_Pahlavi => script::INSCRIPTIONAL_PAHLAVI,
396            us::Script::Inscriptional_Parthian => script::INSCRIPTIONAL_PARTHIAN,
397            us::Script::Javanese => script::JAVANESE,
398            us::Script::Kaithi => script::KAITHI,
399            us::Script::Kannada => script::KANNADA,
400            us::Script::Katakana => script::KATAKANA,
401            us::Script::Kayah_Li => script::KAYAH_LI,
402            us::Script::Kharoshthi => script::KHAROSHTHI,
403            us::Script::Khitan_Small_Script => script::KHITAN_SMALL_SCRIPT,
404            us::Script::Khmer => script::KHMER,
405            us::Script::Khojki => script::KHOJKI,
406            us::Script::Khudawadi => script::KHUDAWADI,
407            us::Script::Lao => script::LAO,
408            us::Script::Latin => script::LATIN,
409            us::Script::Lepcha => script::LEPCHA,
410            us::Script::Limbu => script::LIMBU,
411            us::Script::Linear_A => script::LINEAR_A,
412            us::Script::Linear_B => script::LINEAR_B,
413            us::Script::Lisu => script::LISU,
414            us::Script::Lycian => script::LYCIAN,
415            us::Script::Lydian => script::LYDIAN,
416            us::Script::Mahajani => script::MAHAJANI,
417            us::Script::Makasar => script::MAKASAR,
418            us::Script::Malayalam => script::MALAYALAM,
419            us::Script::Mandaic => script::MANDAIC,
420            us::Script::Manichaean => script::MANICHAEAN,
421            us::Script::Marchen => script::MARCHEN,
422            us::Script::Masaram_Gondi => script::MASARAM_GONDI,
423            us::Script::Medefaidrin => script::MEDEFAIDRIN,
424            us::Script::Meetei_Mayek => script::MEETEI_MAYEK,
425            us::Script::Mende_Kikakui => script::MENDE_KIKAKUI,
426            us::Script::Meroitic_Cursive => script::MEROITIC_CURSIVE,
427            us::Script::Meroitic_Hieroglyphs => script::MEROITIC_HIEROGLYPHS,
428            us::Script::Miao => script::MIAO,
429            us::Script::Modi => script::MODI,
430            us::Script::Mongolian => script::MONGOLIAN,
431            us::Script::Mro => script::MRO,
432            us::Script::Multani => script::MULTANI,
433            us::Script::Myanmar => script::MYANMAR,
434            us::Script::Nabataean => script::NABATAEAN,
435            us::Script::Nandinagari => script::NANDINAGARI,
436            us::Script::New_Tai_Lue => script::NEW_TAI_LUE,
437            us::Script::Newa => script::NEWA,
438            us::Script::Nko => script::NKO,
439            us::Script::Nushu => script::NUSHU,
440            us::Script::Nyiakeng_Puachue_Hmong => script::NYIAKENG_PUACHUE_HMONG,
441            us::Script::Ogham => script::OGHAM,
442            us::Script::Ol_Chiki => script::OL_CHIKI,
443            us::Script::Old_Hungarian => script::OLD_HUNGARIAN,
444            us::Script::Old_Italic => script::OLD_ITALIC,
445            us::Script::Old_North_Arabian => script::OLD_NORTH_ARABIAN,
446            us::Script::Old_Permic => script::OLD_PERMIC,
447            us::Script::Old_Persian => script::OLD_PERSIAN,
448            us::Script::Old_Sogdian => script::OLD_SOGDIAN,
449            us::Script::Old_South_Arabian => script::OLD_SOUTH_ARABIAN,
450            us::Script::Old_Turkic => script::OLD_TURKIC,
451            us::Script::Oriya => script::ORIYA,
452            us::Script::Osage => script::OSAGE,
453            us::Script::Osmanya => script::OSMANYA,
454            us::Script::Pahawh_Hmong => script::PAHAWH_HMONG,
455            us::Script::Palmyrene => script::PALMYRENE,
456            us::Script::Pau_Cin_Hau => script::PAU_CIN_HAU,
457            us::Script::Phags_Pa => script::PHAGS_PA,
458            us::Script::Phoenician => script::PHOENICIAN,
459            us::Script::Psalter_Pahlavi => script::PSALTER_PAHLAVI,
460            us::Script::Rejang => script::REJANG,
461            us::Script::Runic => script::RUNIC,
462            us::Script::Samaritan => script::SAMARITAN,
463            us::Script::Saurashtra => script::SAURASHTRA,
464            us::Script::Sharada => script::SHARADA,
465            us::Script::Shavian => script::SHAVIAN,
466            us::Script::Siddham => script::SIDDHAM,
467            us::Script::SignWriting => script::SIGNWRITING,
468            us::Script::Sinhala => script::SINHALA,
469            us::Script::Sogdian => script::SOGDIAN,
470            us::Script::Sora_Sompeng => script::SORA_SOMPENG,
471            us::Script::Soyombo => script::SOYOMBO,
472            us::Script::Sundanese => script::SUNDANESE,
473            us::Script::Syloti_Nagri => script::SYLOTI_NAGRI,
474            us::Script::Syriac => script::SYRIAC,
475            us::Script::Tagalog => script::TAGALOG,
476            us::Script::Tagbanwa => script::TAGBANWA,
477            us::Script::Tai_Le => script::TAI_LE,
478            us::Script::Tai_Tham => script::TAI_THAM,
479            us::Script::Tai_Viet => script::TAI_VIET,
480            us::Script::Takri => script::TAKRI,
481            us::Script::Tamil => script::TAMIL,
482            us::Script::Tangut => script::TANGUT,
483            us::Script::Telugu => script::TELUGU,
484            us::Script::Thaana => script::THAANA,
485            us::Script::Thai => script::THAI,
486            us::Script::Tibetan => script::TIBETAN,
487            us::Script::Tifinagh => script::TIFINAGH,
488            us::Script::Tirhuta => script::TIRHUTA,
489            us::Script::Ugaritic => script::UGARITIC,
490            us::Script::Vai => script::VAI,
491            us::Script::Wancho => script::WANCHO,
492            us::Script::Warang_Citi => script::WARANG_CITI,
493            us::Script::Yezidi => script::YEZIDI,
494            us::Script::Yi => script::YI,
495            us::Script::Zanabazar_Square => script::ZANABAZAR_SQUARE,
496            _ => script::UNKNOWN,
497        }
498    }
499
500    fn general_category(self) -> hb_unicode_general_category_t {
501        unicode_properties::general_category::UnicodeGeneralCategory::general_category(self)
502    }
503
504    fn combining_class(self) -> CanonicalCombiningClass {
505        unicode_ccc::get_canonical_combining_class(self)
506    }
507
508    fn space_fallback(self) -> hb_unicode_funcs_t::space_t {
509        use hb_unicode_funcs_t::*;
510
511        // All GC=Zs chars that can use a fallback.
512        match self {
513            '\u{0020}' => SPACE,             // SPACE
514            '\u{00A0}' => SPACE,             // NO-BREAK SPACE
515            '\u{2000}' => SPACE_EM_2,        // EN QUAD
516            '\u{2001}' => SPACE_EM,          // EM QUAD
517            '\u{2002}' => SPACE_EM_2,        // EN SPACE
518            '\u{2003}' => SPACE_EM,          // EM SPACE
519            '\u{2004}' => SPACE_EM_3,        // THREE-PER-EM SPACE
520            '\u{2005}' => SPACE_EM_4,        // FOUR-PER-EM SPACE
521            '\u{2006}' => SPACE_EM_6,        // SIX-PER-EM SPACE
522            '\u{2007}' => SPACE_FIGURE,      // FIGURE SPACE
523            '\u{2008}' => SPACE_PUNCTUATION, // PUNCTUATION SPACE
524            '\u{2009}' => SPACE_EM_5,        // THIN SPACE
525            '\u{200A}' => SPACE_EM_16,       // HAIR SPACE
526            '\u{202F}' => SPACE_NARROW,      // NARROW NO-BREAK SPACE
527            '\u{205F}' => SPACE_4_EM_18,     // MEDIUM MATHEMATICAL SPACE
528            '\u{3000}' => SPACE_EM,          // IDEOGRAPHIC SPACE
529            _ => NOT_SPACE,                  // OGHAM SPACE MARK
530        }
531    }
532
533    fn modified_combining_class(self) -> u8 {
534        let mut u = self;
535
536        // XXX This hack belongs to the Myanmar shaper.
537        if u == '\u{1037}' {
538            u = '\u{103A}';
539        }
540
541        // XXX This hack belongs to the USE shaper (for Tai Tham):
542        // Reorder SAKOT to ensure it comes after any tone marks.
543        if u == '\u{1A60}' {
544            return 254;
545        }
546
547        // XXX This hack belongs to the Tibetan shaper:
548        // Reorder PADMA to ensure it comes after any vowel marks.
549        if u == '\u{0FC6}' {
550            return 254;
551        }
552
553        // Reorder TSA -PHRU to reorder before U+0F74
554        if u == '\u{0F39}' {
555            return 127;
556        }
557
558        let k = unicode_ccc::get_canonical_combining_class(u);
559        MODIFIED_COMBINING_CLASS[k as usize]
560    }
561
562    fn mirrored(self) -> Option<char> {
563        unicode_bidi_mirroring::get_mirrored(self)
564    }
565
566    fn is_emoji_extended_pictographic(self) -> bool {
567        // Generated by scripts/gen-unicode-is-emoji-ext-pict.py
568        match self as u32 {
569            0x00A9 => true,
570            0x00AE => true,
571            0x203C => true,
572            0x2049 => true,
573            0x2122 => true,
574            0x2139 => true,
575            0x2194..=0x2199 => true,
576            0x21A9..=0x21AA => true,
577            0x231A..=0x231B => true,
578            0x2328 => true,
579            0x2388 => true,
580            0x23CF => true,
581            0x23E9..=0x23F3 => true,
582            0x23F8..=0x23FA => true,
583            0x24C2 => true,
584            0x25AA..=0x25AB => true,
585            0x25B6 => true,
586            0x25C0 => true,
587            0x25FB..=0x25FE => true,
588            0x2600..=0x2605 => true,
589            0x2607..=0x2612 => true,
590            0x2614..=0x2685 => true,
591            0x2690..=0x2705 => true,
592            0x2708..=0x2712 => true,
593            0x2714 => true,
594            0x2716 => true,
595            0x271D => true,
596            0x2721 => true,
597            0x2728 => true,
598            0x2733..=0x2734 => true,
599            0x2744 => true,
600            0x2747 => true,
601            0x274C => true,
602            0x274E => true,
603            0x2753..=0x2755 => true,
604            0x2757 => true,
605            0x2763..=0x2767 => true,
606            0x2795..=0x2797 => true,
607            0x27A1 => true,
608            0x27B0 => true,
609            0x27BF => true,
610            0x2934..=0x2935 => true,
611            0x2B05..=0x2B07 => true,
612            0x2B1B..=0x2B1C => true,
613            0x2B50 => true,
614            0x2B55 => true,
615            0x3030 => true,
616            0x303D => true,
617            0x3297 => true,
618            0x3299 => true,
619            0x1F000..=0x1F0FF => true,
620            0x1F10D..=0x1F10F => true,
621            0x1F12F => true,
622            0x1F16C..=0x1F171 => true,
623            0x1F17E..=0x1F17F => true,
624            0x1F18E => true,
625            0x1F191..=0x1F19A => true,
626            0x1F1AD..=0x1F1E5 => true,
627            0x1F201..=0x1F20F => true,
628            0x1F21A => true,
629            0x1F22F => true,
630            0x1F232..=0x1F23A => true,
631            0x1F23C..=0x1F23F => true,
632            0x1F249..=0x1F3FA => true,
633            0x1F400..=0x1F53D => true,
634            0x1F546..=0x1F64F => true,
635            0x1F680..=0x1F6FF => true,
636            0x1F774..=0x1F77F => true,
637            0x1F7D5..=0x1F7FF => true,
638            0x1F80C..=0x1F80F => true,
639            0x1F848..=0x1F84F => true,
640            0x1F85A..=0x1F85F => true,
641            0x1F888..=0x1F88F => true,
642            0x1F8AE..=0x1F8FF => true,
643            0x1F90C..=0x1F93A => true,
644            0x1F93C..=0x1F945 => true,
645            0x1F947..=0x1FFFD => true,
646            _ => false,
647        }
648    }
649
650    /// Default_Ignorable codepoints:
651    ///
652    /// Note: While U+115F, U+1160, U+3164 and U+FFA0 are Default_Ignorable,
653    /// we do NOT want to hide them, as the way Uniscribe has implemented them
654    /// is with regular spacing glyphs, and that's the way fonts are made to work.
655    /// As such, we make exceptions for those four.
656    /// Also ignoring U+1BCA0..1BCA3. https://github.com/harfbuzz/harfbuzz/issues/503
657    ///
658    /// Unicode 14.0:
659    /// $ grep '; Default_Ignorable_Code_Point ' DerivedCoreProperties.txt | sed 's/;.*#/#/'
660    /// 00AD          # Cf       SOFT HYPHEN
661    /// 034F          # Mn       COMBINING GRAPHEME JOINER
662    /// 061C          # Cf       ARABIC LETTER MARK
663    /// 115F..1160    # Lo   [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER
664    /// 17B4..17B5    # Mn   [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
665    /// 180B..180D    # Mn   [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
666    /// 180E          # Cf       MONGOLIAN VOWEL SEPARATOR
667    /// 180F          # Mn       MONGOLIAN FREE VARIATION SELECTOR FOUR
668    /// 200B..200F    # Cf   [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
669    /// 202A..202E    # Cf   [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
670    /// 2060..2064    # Cf   [5] WORD JOINER..INVISIBLE PLUS
671    /// 2065          # Cn       <reserved-2065>
672    /// 2066..206F    # Cf  [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
673    /// 3164          # Lo       HANGUL FILLER
674    /// FE00..FE0F    # Mn  [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
675    /// FEFF          # Cf       ZERO WIDTH NO-BREAK SPACE
676    /// FFA0          # Lo       HALFWIDTH HANGUL FILLER
677    /// FFF0..FFF8    # Cn   [9] <reserved-FFF0>..<reserved-FFF8>
678    /// 1BCA0..1BCA3  # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
679    /// 1D173..1D17A  # Cf   [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
680    /// E0000         # Cn       <reserved-E0000>
681    /// E0001         # Cf       LANGUAGE TAG
682    /// E0002..E001F  # Cn  [30] <reserved-E0002>..<reserved-E001F>
683    /// E0020..E007F  # Cf  [96] TAG SPACE..CANCEL TAG
684    /// E0080..E00FF  # Cn [128] <reserved-E0080>..<reserved-E00FF>
685    /// E0100..E01EF  # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
686    /// E01F0..E0FFF  # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
687    fn is_default_ignorable(self) -> bool {
688        let ch = u32::from(self);
689        let plane = ch >> 16;
690        if plane == 0 {
691            // BMP
692            let page = ch >> 8;
693            match page {
694                0x00 => ch == 0x00AD,
695                0x03 => ch == 0x034F,
696                0x06 => ch == 0x061C,
697                0x17 => (0x17B4..=0x17B5).contains(&ch),
698                0x18 => (0x180B..=0x180E).contains(&ch),
699                0x20 => {
700                    (0x200B..=0x200F).contains(&ch)
701                        || (0x202A..=0x202E).contains(&ch)
702                        || (0x2060..=0x206F).contains(&ch)
703                }
704                0xFE => (0xFE00..=0xFE0F).contains(&ch) || ch == 0xFEFF,
705                0xFF => (0xFFF0..=0xFFF8).contains(&ch),
706                _ => false,
707            }
708        } else {
709            // Other planes
710            match plane {
711                0x01 => (0x1D173..=0x1D17A).contains(&ch),
712                0x0E => (0xE0000..=0xE0FFF).contains(&ch),
713                _ => false,
714            }
715        }
716    }
717
718    fn is_variation_selector(self) -> bool {
719        // U+180B..180D, U+180F MONGOLIAN FREE VARIATION SELECTORs are handled in the
720        //Arabic shaper. No need to match them here.
721        let ch = u32::from(self);
722        (0x0FE00..=0x0FE0F).contains(&ch) || // VARIATION SELECTOR - 1..16
723        (0xE0100..=0xE01EF).contains(&ch) // VARIATION SELECTOR - 17..256
724    }
725
726    fn vertical(self) -> Option<char> {
727        Some(match u32::from(self) >> 8 {
728            0x20 => match self {
729                '\u{2013}' => '\u{fe32}', // EN DASH
730                '\u{2014}' => '\u{fe31}', // EM DASH
731                '\u{2025}' => '\u{fe30}', // TWO DOT LEADER
732                '\u{2026}' => '\u{fe19}', // HORIZONTAL ELLIPSIS
733                _ => return None,
734            },
735            0x30 => match self {
736                '\u{3001}' => '\u{fe11}', // IDEOGRAPHIC COMMA
737                '\u{3002}' => '\u{fe12}', // IDEOGRAPHIC FULL STOP
738                '\u{3008}' => '\u{fe3f}', // LEFT ANGLE BRACKET
739                '\u{3009}' => '\u{fe40}', // RIGHT ANGLE BRACKET
740                '\u{300a}' => '\u{fe3d}', // LEFT DOUBLE ANGLE BRACKET
741                '\u{300b}' => '\u{fe3e}', // RIGHT DOUBLE ANGLE BRACKET
742                '\u{300c}' => '\u{fe41}', // LEFT CORNER BRACKET
743                '\u{300d}' => '\u{fe42}', // RIGHT CORNER BRACKET
744                '\u{300e}' => '\u{fe43}', // LEFT WHITE CORNER BRACKET
745                '\u{300f}' => '\u{fe44}', // RIGHT WHITE CORNER BRACKET
746                '\u{3010}' => '\u{fe3b}', // LEFT BLACK LENTICULAR BRACKET
747                '\u{3011}' => '\u{fe3c}', // RIGHT BLACK LENTICULAR BRACKET
748                '\u{3014}' => '\u{fe39}', // LEFT TORTOISE SHELL BRACKET
749                '\u{3015}' => '\u{fe3a}', // RIGHT TORTOISE SHELL BRACKET
750                '\u{3016}' => '\u{fe17}', // LEFT WHITE LENTICULAR BRACKET
751                '\u{3017}' => '\u{fe18}', // RIGHT WHITE LENTICULAR BRACKET
752                _ => return None,
753            },
754            0xfe => match self {
755                '\u{fe4f}' => '\u{fe34}', // WAVY LOW LINE
756                _ => return None,
757            },
758            0xff => match self {
759                '\u{ff01}' => '\u{fe15}', // FULLWIDTH EXCLAMATION MARK
760                '\u{ff08}' => '\u{fe35}', // FULLWIDTH LEFT PARENTHESIS
761                '\u{ff09}' => '\u{fe36}', // FULLWIDTH RIGHT PARENTHESIS
762                '\u{ff0c}' => '\u{fe10}', // FULLWIDTH COMMA
763                '\u{ff1a}' => '\u{fe13}', // FULLWIDTH COLON
764                '\u{ff1b}' => '\u{fe14}', // FULLWIDTH SEMICOLON
765                '\u{ff1f}' => '\u{fe16}', // FULLWIDTH QUESTION MARK
766                '\u{ff3b}' => '\u{fe47}', // FULLWIDTH LEFT SQUARE BRACKET
767                '\u{ff3d}' => '\u{fe48}', // FULLWIDTH RIGHT SQUARE BRACKET
768                '\u{ff3f}' => '\u{fe33}', // FULLWIDTH LOW LINE
769                '\u{ff5b}' => '\u{fe37}', // FULLWIDTH LEFT CURLY BRACKET
770                '\u{ff5d}' => '\u{fe38}', // FULLWIDTH RIGHT CURLY BRACKET
771                _ => return None,
772            },
773            _ => return None,
774        })
775    }
776}
777
// Parameters of the arithmetic Hangul syllable mapping used by
// `compose_hangul` and `decompose_hangul`.
// S = precomposed syllable, L = leading consonant, V = vowel,
// T = trailing consonant.
const S_BASE: u32 = 0xAC00; // first precomposed syllable
const L_BASE: u32 = 0x1100; // first leading consonant
const V_BASE: u32 = 0x1161; // first vowel
// Trailing index 0 means "no trailing consonant" (see the `si % T_COUNT`
// branch in `decompose_hangul`), so this base sits one below the first
// trailing consonant.
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28; // includes the "no trailing consonant" slot
const N_COUNT: u32 = V_COUNT * T_COUNT; // syllables per leading consonant
const S_COUNT: u32 = L_COUNT * N_COUNT; // total number of precomposed syllables
787
788pub fn compose(a: char, b: char) -> Option<char> {
789    if let Some(ab) = compose_hangul(a, b) {
790        return Some(ab);
791    }
792
793    let needle = (a as u64) << 32 | (b as u64);
794    super::unicode_norm::COMPOSITION_TABLE
795        .binary_search_by(|item| item.0.cmp(&needle))
796        .map(|idx| super::unicode_norm::COMPOSITION_TABLE[idx].1)
797        .ok()
798}
799
800fn compose_hangul(a: char, b: char) -> Option<char> {
801    let l = u32::from(a);
802    let v = u32::from(b);
803    if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
804        let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
805        Some(char::try_from(r).unwrap())
806    } else if S_BASE <= l
807        && l <= (S_BASE + S_COUNT - T_COUNT)
808        && T_BASE <= v
809        && v < (T_BASE + T_COUNT)
810        && (l - S_BASE) % T_COUNT == 0
811    {
812        let r = l + (v - T_BASE);
813        Some(char::try_from(r).unwrap())
814    } else {
815        None
816    }
817}
818
819pub fn decompose(ab: char) -> Option<(char, char)> {
820    if let Some(ab) = decompose_hangul(ab) {
821        return Some(ab);
822    }
823
824    super::unicode_norm::DECOMPOSITION_TABLE
825        .binary_search_by(|item| item.0.cmp(&ab))
826        .map(|idx| {
827            let chars = &super::unicode_norm::DECOMPOSITION_TABLE[idx];
828            (chars.1, chars.2.unwrap_or('\0'))
829        })
830        .ok()
831}
832
833pub fn decompose_hangul(ab: char) -> Option<(char, char)> {
834    let si = u32::from(ab).wrapping_sub(S_BASE);
835    if si >= S_COUNT {
836        return None;
837    }
838
839    let (a, b) = if si % T_COUNT != 0 {
840        // LV,T
841        (S_BASE + (si / T_COUNT) * T_COUNT, T_BASE + (si % T_COUNT))
842    } else {
843        // L,V
844        (L_BASE + (si / N_COUNT), V_BASE + (si % N_COUNT) / T_COUNT)
845    };
846
847    Some((char::try_from(a).unwrap(), char::try_from(b).unwrap()))
848}
849
#[cfg(test)]
mod tests {
    // Pins the `UNICODE_VERSION` reported by each Unicode data source this
    // module relies on, so a change in any of them fails the test.
    // NOTE(review): the pinned versions differ between sources (14.0.0,
    // 15.0.0 and 15.1.0) — confirm that this mix is intended.
    #[test]
    fn check_unicode_version() {
        assert_eq!(unicode_bidi_mirroring::UNICODE_VERSION, (14, 0, 0));
        assert_eq!(unicode_ccc::UNICODE_VERSION, (14, 0, 0));
        assert_eq!(unicode_properties::UNICODE_VERSION, (15, 0, 0));
        assert_eq!(unicode_script::UNICODE_VERSION, (15, 1, 0));
        assert_eq!(crate::hb::unicode_norm::UNICODE_VERSION, (14, 0, 0));
    }
}
861
862// TODO: remove
/// Numeric identifiers for the Unicode general categories.
///
/// The values are consecutive, starting at 0, and grouped by major class in
/// the order: other, letter, mark, number, punctuation, symbol, separator.
pub mod hb_gc {
    // Other
    pub const RB_UNICODE_GENERAL_CATEGORY_CONTROL: u32 = 0;
    pub const RB_UNICODE_GENERAL_CATEGORY_FORMAT: u32 = 1;
    pub const RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED: u32 = 2;
    pub const RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE: u32 = 3;
    pub const RB_UNICODE_GENERAL_CATEGORY_SURROGATE: u32 = 4;
    // Letters
    pub const RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: u32 = 5;
    pub const RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: u32 = 6;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: u32 = 7;
    pub const RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: u32 = 8;
    pub const RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: u32 = 9;
    // Marks
    pub const RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: u32 = 10;
    pub const RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: u32 = 11;
    pub const RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: u32 = 12;
    // Numbers
    pub const RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER: u32 = 13;
    pub const RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER: u32 = 14;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER: u32 = 15;
    // Punctuation
    pub const RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: u32 = 16;
    pub const RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: u32 = 17;
    pub const RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: u32 = 18;
    pub const RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: u32 = 19;
    pub const RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: u32 = 20;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: u32 = 21;
    pub const RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: u32 = 22;
    // Symbols
    pub const RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL: u32 = 23;
    pub const RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL: u32 = 24;
    pub const RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL: u32 = 25;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: u32 = 26;
    // Separators
    pub const RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR: u32 = 27;
    pub const RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR: u32 = 28;
    pub const RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR: u32 = 29;
}