unic_langid_impl/
lib.rs

1mod errors;
2mod layout_table;
3#[cfg(feature = "likelysubtags")]
4pub mod likelysubtags;
5#[doc(hidden)]
6pub mod parser;
7#[cfg(feature = "serde")]
8mod serde;
9pub mod subtags;
10
11pub use crate::errors::LanguageIdentifierError;
12use std::fmt::Write;
13use std::iter::Peekable;
14use std::str::FromStr;
15
/// Enum representing available character direction orientations.
///
/// The type is a plain fieldless enum, so besides being `Copy` it also
/// implements `Eq` and `Hash` and can be used as a key in hashed collections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CharacterDirection {
    /// Right To Left
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left To Right
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top To Bottom
    ///
    /// Used in Traditional Mongolian
    TTB,
}
32
/// Decomposed form of a `LanguageIdentifier` as returned by
/// `LanguageIdentifier::into_parts`: language, optional script, optional
/// region and the list of variants (in that order).
type PartsTuple = (
    subtags::Language,
    Option<subtags::Script>,
    Option<subtags::Region>,
    Vec<subtags::Variant>,
);
39
/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
///
/// # Examples
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "en-US".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script, None);
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().len(), 0);
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
/// No subtag validation is performed.
///
/// # Examples
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct LanguageIdentifier {
    /// Language subtag.
    pub language: subtags::Language,
    /// Script subtag, if one is set.
    pub script: Option<subtags::Script>,
    /// Region subtag, if one is set.
    pub region: Option<subtags::Region>,
    /// Variant subtags; `None` when there are none.
    ///
    /// Kept private: `from_parts` and `set_variants` store the list sorted and
    /// deduplicated, and read access goes through `variants()`.
    variants: Option<Box<[subtags::Variant]>>,
}
90
91impl LanguageIdentifier {
92    /// A constructor which takes a utf8 slice, parses it and
93    /// produces a well-formed `LanguageIdentifier`.
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// use unic_langid_impl::LanguageIdentifier;
99    ///
100    /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
101    ///     .expect("Parsing failed.");
102    ///
103    /// assert_eq!(li.to_string(), "en-US");
104    /// ```
105    pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
106        Ok(parser::parse_language_identifier(v)?)
107    }
108
109    /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
110    /// produces a well-formed `LanguageIdentifier`.
111    ///
112    /// # Examples
113    ///
114    /// ```
115    /// use unic_langid_impl::LanguageIdentifier;
116    ///
117    /// let li = LanguageIdentifier::from_parts(
118    ///     "fr".parse().expect("Parsing failed."),
119    ///     None,
120    ///     Some("CA".parse().expect("Parsing failed.")),
121    ///     &[]
122    /// );
123    ///
124    /// assert_eq!(li.to_string(), "fr-CA");
125    /// ```
126    pub fn from_parts(
127        language: subtags::Language,
128        script: Option<subtags::Script>,
129        region: Option<subtags::Region>,
130        variants: &[subtags::Variant],
131    ) -> Self {
132        let variants = if !variants.is_empty() {
133            let mut v = variants.to_vec();
134            v.sort_unstable();
135            v.dedup();
136            Some(v.into_boxed_slice())
137        } else {
138            None
139        };
140
141        Self {
142            language,
143            script,
144            region,
145            variants,
146        }
147    }
148
149    /// # Unchecked
150    ///
151    /// This function accepts subtags expecting variants
152    /// to be deduplicated and ordered.
153    pub const fn from_raw_parts_unchecked(
154        language: subtags::Language,
155        script: Option<subtags::Script>,
156        region: Option<subtags::Region>,
157        variants: Option<Box<[subtags::Variant]>>,
158    ) -> Self {
159        Self {
160            language,
161            script,
162            region,
163            variants,
164        }
165    }
166
167    #[doc(hidden)]
168    /// This method is used by `unic-locale` to handle partial
169    /// subtag iterator.
170    ///
171    /// Not stable.
172    pub fn try_from_iter<'a>(
173        iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
174        allow_extension: bool,
175    ) -> Result<LanguageIdentifier, LanguageIdentifierError> {
176        Ok(parser::parse_language_identifier_from_iter(
177            iter,
178            allow_extension,
179        )?)
180    }
181
182    /// Consumes `LanguageIdentifier` and produces raw internal representations
183    /// of all subtags in form of `u64`/`u32`.
184    ///
185    /// Primarily used for storing internal representation and restoring via
186    /// `from_raw_parts_unchecked`.
187    ///
188    /// # Examples
189    ///
190    /// ```
191    /// use unic_langid_impl::LanguageIdentifier;
192    /// use tinystr::{TinyStr8, TinyStr4};
193    ///
194    /// let li: LanguageIdentifier = "en-US".parse()
195    ///     .expect("Parsing failed.");
196    ///
197    /// let (lang, script, region, variants) = li.into_parts();
198    ///
199    /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(
200    /// //     lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }),
201    /// //    script.map(|s| unsafe { TinyStr4::new_unchecked(s) }),
202    /// //    region.map(|r| unsafe { TinyStr4::new_unchecked(r) }),
203    /// //    variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()),
204    /// //);
205    ///
206    /// //assert_eq!(li2.to_string(), "en-US");
207    /// ```
208    pub fn into_parts(self) -> PartsTuple {
209        (
210            self.language,
211            self.script,
212            self.region,
213            self.variants.map_or_else(Vec::new, |v| v.to_vec()),
214        )
215    }
216
217    /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier`
218    /// allowing for either side to use the missing fields as wildcards.
219    ///
220    /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
221    ///
222    /// # Examples
223    ///
224    /// ```
225    /// use unic_langid_impl::LanguageIdentifier;
226    ///
227    /// let li1: LanguageIdentifier = "en".parse()
228    ///     .expect("Parsing failed.");
229    ///
230    /// let li2: LanguageIdentifier = "en-US".parse()
231    ///     .expect("Parsing failed.");
232    ///
233    /// assert_ne!(li1, li2); // "en" != "en-US"
234    /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
235    ///
236    /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
237    /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
238    /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
239    /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
240    /// ```
241    pub fn matches<O: AsRef<Self>>(
242        &self,
243        other: &O,
244        self_as_range: bool,
245        other_as_range: bool,
246    ) -> bool {
247        let other = other.as_ref();
248        self.language
249            .matches(other.language, self_as_range, other_as_range)
250            && subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
251            && subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
252            && subtags_match(
253                &self.variants,
254                &other.variants,
255                self_as_range,
256                other_as_range,
257            )
258    }
259
260    /// Returns a vector of variants subtags of the `LanguageIdentifier`.
261    ///
262    /// # Examples
263    ///
264    /// ```
265    /// use unic_langid_impl::LanguageIdentifier;
266    ///
267    /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
268    ///     .expect("Parsing failed.");
269    ///
270    /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
271    ///
272    /// let li2: LanguageIdentifier = "de".parse()
273    ///     .expect("Parsing failed.");
274    ///
275    /// assert_eq!(li2.variants().len(), 0);
276    /// ```
277    pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
278        let variants: &[_] = match self.variants {
279            Some(ref v) => v,
280            None => &[],
281        };
282
283        variants.iter()
284    }
285
286    /// Sets variant subtags of the `LanguageIdentifier`.
287    ///
288    /// # Examples
289    ///
290    /// ```
291    /// use unic_langid_impl::LanguageIdentifier;
292    ///
293    /// let mut li: LanguageIdentifier = "ca-ES".parse()
294    ///     .expect("Parsing failed.");
295    ///
296    /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
297    ///
298    /// assert_eq!(li.to_string(), "ca-ES-valencia");
299    /// ```
300    pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
301        let mut v = variants.to_vec();
302
303        if v.is_empty() {
304            self.variants = None;
305        } else {
306            v.sort_unstable();
307            v.dedup();
308            self.variants = Some(v.into_boxed_slice());
309        }
310    }
311
312    /// Tests if a variant subtag is present in the `LanguageIdentifier`.
313    ///
314    /// # Examples
315    ///
316    /// ```
317    /// use unic_langid_impl::LanguageIdentifier;
318    ///
319    /// let mut li: LanguageIdentifier = "ca-ES-macos".parse()
320    ///     .expect("Parsing failed.");
321    ///
322    /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
323    /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
324    /// ```
325    pub fn has_variant(&self, variant: subtags::Variant) -> bool {
326        if let Some(variants) = &self.variants {
327            variants.contains(&variant)
328        } else {
329            false
330        }
331    }
332
333    /// Clears variant subtags of the `LanguageIdentifier`.
334    ///
335    /// # Examples
336    ///
337    /// ```
338    /// use unic_langid_impl::LanguageIdentifier;
339    ///
340    /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
341    ///     .expect("Parsing failed.");
342    ///
343    /// li.clear_variants();
344    ///
345    /// assert_eq!(li.to_string(), "ca-ES");
346    /// ```
347    pub fn clear_variants(&mut self) {
348        self.variants = None;
349    }
350
351    /// Extends the `LanguageIdentifier` adding likely subtags based
352    /// on tables provided by CLDR.
353    ///
354    /// # Examples
355    ///
356    /// ```
357    /// use unic_langid_impl::LanguageIdentifier;
358    ///
359    /// let mut li: LanguageIdentifier = "en-US".parse()
360    ///     .expect("Parsing failed.");
361    ///
362    /// assert_eq!(li.maximize(), true);
363    /// assert_eq!(li.to_string(), "en-Latn-US");
364    /// ```
365    #[cfg(feature = "likelysubtags")]
366    pub fn maximize(&mut self) -> bool {
367        if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
368            self.language = new_li.0;
369            self.script = new_li.1;
370            self.region = new_li.2;
371            true
372        } else {
373            false
374        }
375    }
376
377    /// Extends the `LanguageIdentifier` removing likely subtags based
378    /// on tables provided by CLDR.
379    ///
380    /// # Examples
381    ///
382    /// ```
383    /// use unic_langid_impl::LanguageIdentifier;
384    ///
385    /// let mut li: LanguageIdentifier = "en-Latn-US".parse()
386    ///     .expect("Parsing failed.");
387    ///
388    /// assert_eq!(li.minimize(), true);
389    /// assert_eq!(li.to_string(), "en");
390    /// ```
391    #[cfg(feature = "likelysubtags")]
392    pub fn minimize(&mut self) -> bool {
393        if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
394            self.language = new_li.0;
395            self.script = new_li.1;
396            self.region = new_li.2;
397            true
398        } else {
399            false
400        }
401    }
402
403    /// Returns character direction of the `LanguageIdentifier`.
404    ///
405    /// # Examples
406    ///
407    /// ```
408    /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
409    ///
410    /// let li1: LanguageIdentifier = "es-AR".parse()
411    ///     .expect("Parsing failed.");
412    /// let li2: LanguageIdentifier = "fa".parse()
413    ///     .expect("Parsing failed.");
414    ///
415    /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
416    /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
417    /// ```
418    pub fn character_direction(&self) -> CharacterDirection {
419        match (self.language.into(), self.script) {
420            (_, Some(script))
421                if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>
422            {
423                CharacterDirection::LTR
424            }
425            (_, Some(script))
426                if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
427            {
428                CharacterDirection::RTL
429            }
430            (_, Some(script))
431                if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>
432            {
433                CharacterDirection::TTB
434            }
435            (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
436                #[cfg(feature = "likelysubtags")]
437                if let Some((_, Some(script), _)) =
438                    likelysubtags::maximize(self.language, None, self.region)
439                {
440                    if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) {
441                        return CharacterDirection::LTR;
442                    }
443                }
444                CharacterDirection::RTL
445            }
446            _ => CharacterDirection::LTR,
447        }
448    }
449}
450
451impl FromStr for LanguageIdentifier {
452    type Err = LanguageIdentifierError;
453
454    fn from_str(source: &str) -> Result<Self, Self::Err> {
455        Self::from_bytes(source.as_bytes())
456    }
457}
458
/// Identity `AsRef` implementation, so that APIs bounded on
/// `AsRef<LanguageIdentifier>` (such as `LanguageIdentifier::matches`)
/// accept a plain `LanguageIdentifier`.
impl AsRef<LanguageIdentifier> for LanguageIdentifier {
    #[inline(always)]
    fn as_ref(&self) -> &LanguageIdentifier {
        self
    }
}
465
466impl std::fmt::Display for LanguageIdentifier {
467    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
468        self.language.fmt(f)?;
469        if let Some(ref script) = self.script {
470            f.write_char('-')?;
471            script.fmt(f)?;
472        }
473        if let Some(ref region) = self.region {
474            f.write_char('-')?;
475            region.fmt(f)?;
476        }
477        if let Some(variants) = &self.variants {
478            for variant in variants.iter() {
479                f.write_char('-')?;
480                variant.fmt(f)?;
481            }
482        }
483        Ok(())
484    }
485}
486
487impl PartialEq<&str> for LanguageIdentifier {
488    fn eq(&self, other: &&str) -> bool {
489        self.to_string().as_str() == *other
490    }
491}
492
/// Compares two optional subtags, optionally treating a missing subtag on
/// either side as a wildcard.
///
/// Returns `true` when `subtag1` is `None` and `as_range1` is set, when
/// `subtag2` is `None` and `as_range2` is set, or when both sides are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    match (subtag1, subtag2) {
        (None, _) if as_range1 => true,
        (_, None) if as_range2 => true,
        (left, right) => left == right,
    }
}
501
/// Returns `true` when the optional list is either `None` or `Some` of an
/// empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    match subtag {
        None => true,
        Some(items) => items.is_empty(),
    }
}
505
506fn subtags_match<P: PartialEq>(
507    subtag1: &Option<Box<[P]>>,
508    subtag2: &Option<Box<[P]>>,
509    as_range1: bool,
510    as_range2: bool,
511) -> bool {
512    // or is some and is empty!
513    (as_range1 && is_option_empty(subtag1))
514        || (as_range2 && is_option_empty(subtag2))
515        || subtag1 == subtag2
516}
517
518/// This is a best-effort operation that performs all available levels of canonicalization.
519///
520/// At the moment the operation will normalize casing and the separator, but in the future
521/// it may also validate and update from deprecated subtags to canonical ones.
522///
523/// # Examples
524///
525/// ```
526/// use unic_langid_impl::canonicalize;
527///
528/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
529/// ```
530pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
531    let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?;
532    Ok(lang_id.to_string())
533}
534
/// Non-ASCII subtags must be rejected by the parser.
#[test]
fn invalid_subtag() {
    let parsed = LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes());
    assert!(parsed.is_err());
}