icu_properties/
runtime.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you
//! have a use case for this!
//!
//! This module contains utilities for working with properties where the specific property in use
//! is not known at compile time.
//!
//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working
//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines.

#[cfg(doc)]
use super::{maps, script, GeneralCategory, GeneralCategoryGroup, Script};

/// This type can represent any Unicode property.
///
/// This is intended to be used in situations where the exact unicode property needed is
/// only known at runtime, for example in regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum
#[allow(clippy::exhaustive_structs)] // newtype
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub struct UnicodeProperty(pub u32);

#[allow(non_upper_case_globals)]
#[allow(unused)] // experimental, may be made public later
impl UnicodeProperty {
    /// Binary property `Alphabetic`
    pub const Alphabetic: Self = UnicodeProperty(0);
    /// Binary property `ASCII_Hex_Digit`
    pub const AsciiHexDigit: Self = UnicodeProperty(1);
    /// Binary property `Bidi_Control`
    pub const BidiControl: Self = UnicodeProperty(2);
    /// Binary property `Bidi_Mirrored`
    pub const BidiMirrored: Self = UnicodeProperty(3);
    /// Binary property `Dash`
    pub const Dash: Self = UnicodeProperty(4);
    /// Binary property `Default_Ignorable_Code_Point`
    pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5);
    /// Binary property `Deprecated`
    pub const Deprecated: Self = UnicodeProperty(6);
    /// Binary property `Diacritic`
    pub const Diacritic: Self = UnicodeProperty(7);
    /// Binary property `Extender`
    pub const Extender: Self = UnicodeProperty(8);
    /// Binary property `Full_Composition_Exclusion`
    pub const FullCompositionExclusion: Self = UnicodeProperty(9);
    /// Binary property `Grapheme_Base`
    pub const GraphemeBase: Self = UnicodeProperty(10);
    /// Binary property `Grapheme_Extend`
    pub const GraphemeExtend: Self = UnicodeProperty(11);
    /// Binary property `Grapheme_Link`
    pub const GraphemeLink: Self = UnicodeProperty(12);
    /// Binary property `Hex_Digit`
    pub const HexDigit: Self = UnicodeProperty(13);
    /// Binary property `Hyphen`
    pub const Hyphen: Self = UnicodeProperty(14);
    /// Binary property `ID_Continue`
    pub const IdContinue: Self = UnicodeProperty(15);
    /// Binary property `ID_Start`
    pub const IdStart: Self = UnicodeProperty(16);
    /// Binary property `Ideographic`
    pub const Ideographic: Self = UnicodeProperty(17);
    /// Binary property `IDS_Binary_Operator`
    pub const IdsBinaryOperator: Self = UnicodeProperty(18);
    /// Binary property `IDS_Trinary_Operator`
    pub const IdsTrinaryOperator: Self = UnicodeProperty(19);
    /// Binary property `Join_Control`
    pub const JoinControl: Self = UnicodeProperty(20);
    /// Binary property `Logical_Order_Exception`
    pub const LogicalOrderException: Self = UnicodeProperty(21);
    /// Binary property `Lowercase`
    pub const Lowercase: Self = UnicodeProperty(22);
    /// Binary property `Math`
    pub const Math: Self = UnicodeProperty(23);
    /// Binary property `Noncharacter_Code_Point`
    pub const NoncharacterCodePoint: Self = UnicodeProperty(24);
    /// Binary property `Quotation_Mark`
    pub const QuotationMark: Self = UnicodeProperty(25);
    /// Binary property `Radical`
    pub const Radical: Self = UnicodeProperty(26);
    /// Binary property `Soft_Dotted`
    pub const SoftDotted: Self = UnicodeProperty(27);
    /// Binary property `Terminal_Punctuation`
    pub const TerminalPunctuation: Self = UnicodeProperty(28);
    /// Binary property `Unified_Ideograph`
    pub const UnifiedIdeograph: Self = UnicodeProperty(29);
    /// Binary property `Uppercase`
    pub const Uppercase: Self = UnicodeProperty(30);
    /// Binary property `White_Space`
    pub const WhiteSpace: Self = UnicodeProperty(31);
    /// Binary property `XID_Continue`
    pub const XidContinue: Self = UnicodeProperty(32);
    /// Binary property `XID_Start`
    pub const XidStart: Self = UnicodeProperty(33);
    /// Binary property `Case_Sensitive`
    pub const CaseSensitive: Self = UnicodeProperty(34);
    /// Binary property `Sentence_Terminal`
    pub const SentenceTerminal: Self = UnicodeProperty(35);
    /// Binary property `Variation_Selector`
    pub const VariationSelector: Self = UnicodeProperty(36);
    /// Binary property `NFD_Inert`
    pub const NfdInert: Self = UnicodeProperty(37);
    /// Binary property `NFKD_Inert`
    pub const NfkdInert: Self = UnicodeProperty(38);
    /// Binary property `NFC_Inert`
    pub const NfcInert: Self = UnicodeProperty(39);
    /// Binary property `NFKC_Inert`
    pub const NfkcInert: Self = UnicodeProperty(40);
    /// Binary property `Segment_Starter`
    pub const SegmentStarter: Self = UnicodeProperty(41);
    /// Binary property `Pattern_Syntax`
    pub const PatternSyntax: Self = UnicodeProperty(42);
    /// Binary property `Pattern_White_Space`
    pub const PatternWhiteSpace: Self = UnicodeProperty(43);
    /// Binary property `alnum`
    pub const Alnum: Self = UnicodeProperty(44);
    /// Binary property `blank`
    pub const Blank: Self = UnicodeProperty(45);
    /// Binary property `graph`
    pub const Graph: Self = UnicodeProperty(46);
    /// Binary property `print`
    pub const Print: Self = UnicodeProperty(47);
    /// Binary property `xdigit`
    pub const XDigit: Self = UnicodeProperty(48);
    /// Binary property `Cased`
    pub const Cased: Self = UnicodeProperty(49);
    /// Binary property `Case_Ignorable`
    pub const CaseIgnorable: Self = UnicodeProperty(50);
    /// Binary property `Changes_When_Lowercased`
    pub const ChangesWhenLowercased: Self = UnicodeProperty(51);
    /// Binary property `Changes_When_Uppercased`
    pub const ChangesWhenUppercased: Self = UnicodeProperty(52);
    /// Binary property `Changes_When_Titlecased`
    pub const ChangesWhenTitlecased: Self = UnicodeProperty(53);
    /// Binary property `Changes_When_Casefolded`
    pub const ChangesWhenCasefolded: Self = UnicodeProperty(54);
    /// Binary property `Changes_When_Casemapped`
    pub const ChangesWhenCasemapped: Self = UnicodeProperty(55);
    /// Binary property `Changes_When_NFKC_Casefolded`
    pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56);
    /// Binary property `Emoji`
    pub const Emoji: Self = UnicodeProperty(57);
    /// Binary property `Emoji_Presentation`
    pub const EmojiPresentation: Self = UnicodeProperty(58);
    /// Binary property `Emoji_Modifier`
    pub const EmojiModifier: Self = UnicodeProperty(59);
    /// Binary property `Emoji_Modifier_Base`
    pub const EmojiModifierBase: Self = UnicodeProperty(60);
    /// Binary property `Emoji_Component`
    pub const EmojiComponent: Self = UnicodeProperty(61);
    /// Binary property `Regional_Indicator`
    pub const RegionalIndicator: Self = UnicodeProperty(62);
    /// Binary property `Prepended_Concatenation_Mark`
    pub const PrependedConcatenationMark: Self = UnicodeProperty(63);
    /// Binary property `Extended_Pictographic`
    pub const ExtendedPictographic: Self = UnicodeProperty(64);
    /// Binary property `Basic_Emoji`
    pub const BasicEmoji: Self = UnicodeProperty(65);
    /// Binary property `Emoji_Keycap_Sequence`
    pub const EmojiKeycapSequence: Self = UnicodeProperty(66);
    /// Binary property `RGI_Emoji_Modifier_Sequence`
    pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67);
    /// Binary property `RGI_Emoji_Flag_Sequence`
    pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68);
    /// Binary property `RGI_Emoji_Tag_Sequence`
    pub const RgiEmojiTagSequence: Self = UnicodeProperty(69);
    /// Binary property `RGI_Emoji_ZWJ_Sequence`
    pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70);
    /// Binary property `RGI_Emoji`
    pub const RgiEmoji: Self = UnicodeProperty(71);

    const BINARY_MAX: Self = Self::RgiEmoji;

    /// Enumerated property `Bidi_Class`
    pub const BidiClass: Self = UnicodeProperty(0x1000);
    /// Enumerated property `Block`
    pub const Block: Self = UnicodeProperty(0x1001);
    /// Enumerated property `Canonical_Combining_Class`
    pub const CombiningClass: Self = UnicodeProperty(0x1002);
    /// Enumerated property `Decomposition_Type`
    pub const DecompositionType: Self = UnicodeProperty(0x1003);
    /// Enumerated property `East_Asian_Width`
    pub const EastAsianWidth: Self = UnicodeProperty(0x1004);
    /// Enumerated property `General_Category`
    pub const GeneralCategory: Self = UnicodeProperty(0x1005);
    /// Enumerated property `Joining_Group`
    pub const JoiningGroup: Self = UnicodeProperty(0x1006);
    /// Enumerated property `Joining_Type`
    pub const JoiningType: Self = UnicodeProperty(0x1007);
    /// Enumerated property `Line_Break`
    pub const LineBreak: Self = UnicodeProperty(0x1008);
    /// Enumerated property `Numeric_Type`
    pub const NumericType: Self = UnicodeProperty(0x1009);
    /// Enumerated property `Script`
    pub const Script: Self = UnicodeProperty(0x100A);
    /// Enumerated property `Hangul_Syllable_Type`
    pub const HangulSyllableType: Self = UnicodeProperty(0x100B);
    /// Enumerated property `NFD_Quick_Check`
    pub const NFDQuickCheck: Self = UnicodeProperty(0x100C);
    /// Enumerated property `NFKD_Quick_Check`
    pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D);
    /// Enumerated property `NFC_Quick_Check`
    pub const NFCQuickCheck: Self = UnicodeProperty(0x100E);
    /// Enumerated property `NFKC_Quick_Check`
    pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F);
    /// Enumerated property `Lead_Canonical_Combining_Class`
    pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010);
    /// Enumerated property `Trail_Canonical_Combining_Class`
    pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011);
    /// Enumerated property `Grapheme_Cluster_Break`
    pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012);
    /// Enumerated property `Sentence_Break`
    pub const SentenceBreak: Self = UnicodeProperty(0x1013);
    /// Enumerated property `Word_Break`
    pub const WordBreak: Self = UnicodeProperty(0x1014);
    /// Enumerated property `Bidi_Paired_Bracket_Type`
    pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015);
    /// Enumerated property `Indic_Positional_Category`
    pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016);
    /// Enumerated property `Indic_Syllabic_Category`
    pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017);
    /// Enumerated property `Vertical_Orientation`
    pub const VerticalOrientation: Self = UnicodeProperty(0x1018);

    const ENUMERATED_MAX: Self = Self::VerticalOrientation;

    /// Mask property `General_Category_Mask`
    pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000);

    /// Double property `Numeric_Value`
    pub const NumericValue: Self = UnicodeProperty(0x3000);

    /// String property `Age`
    pub const Age: Self = UnicodeProperty(0x4000);
    /// String property `Bidi_Mirroring_Glyph`
    pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001);
    /// String property `Case_Folding`
    pub const CaseFolding: Self = UnicodeProperty(0x4002);
    /// String property `ISO_Comment`
    pub const ISOComment: Self = UnicodeProperty(0x4003);
    /// String property `Lowercase_Mapping`
    pub const LowercaseMapping: Self = UnicodeProperty(0x4004);
    /// String property `Name`
    pub const Name: Self = UnicodeProperty(0x4005);
    /// String property `Simple_Case_Folding`
    pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006);
    /// String property `Simple_Lowercase_Mapping`
    pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007);
    /// String property `Simple_Titlecase_Mapping`
    pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008);
    /// String property `Simple_Uppercase_Mapping`
    pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009);
    /// String property `Titlecase_Mapping`
    pub const TitlecaseMapping: Self = UnicodeProperty(0x400A);
    /// String property `Unicode_1_Name`
    pub const Unicode1_Name: Self = UnicodeProperty(0x400B);
    /// String property `Uppercase_Mapping`
    pub const UppercaseMapping: Self = UnicodeProperty(0x400C);
    /// String property `Bidi_Paired_Bracket`
    pub const BidiPairedBracket: Self = UnicodeProperty(0x400D);

    const STRING_MAX: Self = Self::BidiPairedBracket;

    /// Misc property `Script_Extensions`
    pub const ScriptExtensions: Self = UnicodeProperty(0x7000);
}

#[allow(unused)] // experimental, may be made public later
impl UnicodeProperty {
    /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`]
    /// value for it provided it belongs to the [subset relevant for ECMA262 regexes][subset]
    ///
    /// Returns none if the name does not match any of the names in this subset. Performs
    /// strict matching of names.
    ///
    /// If using this to implement an ECMA262-compliant regex engine, please note these caveats:
    ///
    /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`].
    ///   Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`],
    ///   [`maps::load_general_category()`], [`maps::load_script()`]  and [`script::load_script_with_extensions_unstable()`].
    /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not
    ///   defined as properties.
    ///    - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
    ///    - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
    ///    - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
    /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`.
    ///   This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup
    ///   functions where necessary.
    /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties,
    ///   e.g. you can do things like `\p{Lu}` as shortform for `\p{gc=Lu}`. This method does not do so
    ///   since these are property values, not properties, but you can use
    ///   [`GeneralCategory::get_name_to_enum_mapper()`] or  [`GeneralCategoryGroup::get_name_to_enum_mapper()`]
    ///   to handle this.
    ///
    ///
    /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties
    pub fn parse_ecma262_name(name: &str) -> Option<Self> {
        let prop = match name {
            "General_Category" | "gc" => Self::GeneralCategory,
            "Script" | "sc" => Self::Script,
            "Script_Extensions" | "scx" => Self::ScriptExtensions,
            "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit,
            "Alphabetic" | "Alpha" => Self::Alphabetic,
            "Bidi_Control" | "Bidi_C" => Self::BidiControl,
            "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored,
            "Case_Ignorable" | "CI" => Self::CaseIgnorable,
            "Cased" => Self::Cased,
            "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded,
            "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped,
            "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased,
            "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded,
            "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased,
            "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased,
            "Dash" => Self::Dash,
            "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint,
            "Deprecated" | "Dep" => Self::Deprecated,
            "Diacritic" | "Dia" => Self::Diacritic,
            "Emoji" => Self::Emoji,
            "Emoji_Component" | "EComp" => Self::EmojiComponent,
            "Emoji_Modifier" | "EMod" => Self::EmojiModifier,
            "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase,
            "Emoji_Presentation" | "EPres" => Self::EmojiPresentation,
            "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic,
            "Extender" | "Ext" => Self::Extender,
            "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase,
            "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend,
            "Hex_Digit" | "Hex" => Self::HexDigit,
            "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator,
            "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator,
            "ID_Continue" | "IDC" => Self::IdContinue,
            "ID_Start" | "IDS" => Self::IdStart,
            "Ideographic" | "Ideo" => Self::Ideographic,
            "Join_Control" | "Join_C" => Self::JoinControl,
            "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException,
            "Lowercase" | "Lower" => Self::Lowercase,
            "Math" => Self::Math,
            "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint,
            "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax,
            "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace,
            "Quotation_Mark" | "QMark" => Self::QuotationMark,
            "Radical" => Self::Radical,
            "Regional_Indicator" | "RI" => Self::RegionalIndicator,
            "Sentence_Terminal" | "STerm" => Self::SentenceTerminal,
            "Soft_Dotted" | "SD" => Self::SoftDotted,
            "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation,
            "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph,
            "Uppercase" | "Upper" => Self::Uppercase,
            "Variation_Selector" | "VS" => Self::VariationSelector,
            "White_Space" | "space" => Self::WhiteSpace,
            "XID_Continue" | "XIDC" => Self::XidContinue,
            "XID_Start" | "XIDS" => Self::XidStart,
            _ => return None,
        };

        Some(prop)
    }
}