sys_locale/
unix.rs

1use std::{env, ffi::OsStr};
2
// Names of the environment variables consulted by `_get`, declared in the
// same order in which `_get` checks them.
//
// `LANGUAGE` may hold several colon-separated locales; the other three hold
// a single locale each.
const LANGUAGE: &str = "LANGUAGE";
const LC_ALL: &str = "LC_ALL";
const LC_MESSAGES: &str = "LC_MESSAGES";
const LANG: &str = "LANG";
7
/// Environment variable access abstraction to allow testing without
/// mutating env variables.
///
/// Use [StdEnv] to query [std::env]
trait EnvAccess {
    /// Returns the value stored under `key`, or `None` when the
    /// implementation has no value to report for it.
    ///
    /// See also [std::env::var]
    fn get(&self, key: impl AsRef<OsStr>) -> Option<String>;
}
16
17/// Proxy to [std::env]
18struct StdEnv;
19impl EnvAccess for StdEnv {
20    fn get(&self, key: impl AsRef<OsStr>) -> Option<String> {
21        env::var(key).ok()
22    }
23}
24
/// Returns the user's locales as found in the real process environment.
///
/// Thin wrapper that runs [_get] against [StdEnv]; see [_get] for the
/// variables that are consulted and the order of the results.
pub(crate) fn get() -> impl Iterator<Item = String> {
    _get(&StdEnv)
}
28
29/// Retrieves a list of unique locales by checking specific environment variables
30/// in a predefined order: LANGUAGE, LC_ALL, LC_MESSAGES, and LANG.
31///
32/// The function first checks the `LANGUAGE` environment variable, which can contain
33/// one or more locales separated by a colon (`:`). It then splits these values,
34/// converts them from [POSIX](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap08.html)
35/// to [BCP 47](https://www.ietf.org/rfc/bcp/bcp47.html) format, and adds them to the list of locales
36/// if they are not already included.
37///
38/// Next, the function checks the `LC_ALL`, `LC_MESSAGES`, and `LANG` environment
39/// variables. Each of these variables contains a single locale. If a locale is found,
40/// and it's not empty, it is converted to BCP 47 format and added to the list if
41/// it is not already included.
42///
43/// For more information check this issue: https://github.com/1Password/sys-locale/issues/14.
44///
45/// The function ensures that locales are returned in the order of precedence
46/// and without duplicates. The final list of locales is returned as an iterator.
47///
48/// # Returns
49///
50/// An iterator over the unique locales found in the environment variables.
51///
52/// # Environment Variables Checked
53///
54/// 1. `LANGUAGE` - Can contain multiple locales, each separated by a colon (`:`), highest priority.
55/// 2. `LC_ALL` - Contains a single locale, high priority.
56/// 3. `LC_MESSAGES` - Contains a single locale, medium priority.
57/// 4. `LANG` - Contains a single locale, low priority.
58///
59/// # Example
60///
61/// ```ignore
62/// let locales: Vec<String> = _get(&env).collect();
63/// for locale in locales {
64///     println!("User's preferred locales: {}", locale);
65/// }
66/// ```
67fn _get(env: &impl EnvAccess) -> impl Iterator<Item = String> {
68    let mut locales = Vec::new();
69
70    // LANGUAGE contains one or multiple locales separated by colon (':')
71    if let Some(val) = env.get(LANGUAGE).filter(|val| !val.is_empty()) {
72        for part in val.split(':') {
73            let locale = posix_to_bcp47(part);
74            if !locales.contains(&locale) {
75                locales.push(locale);
76            }
77        }
78    }
79
80    // LC_ALL, LC_MESSAGES and LANG contain one locale
81    for variable in [LC_ALL, LC_MESSAGES, LANG] {
82        if let Some(val) = env.get(variable).filter(|val| !val.is_empty()) {
83            let locale = posix_to_bcp47(&val);
84            if !locales.contains(&locale) {
85                locales.push(locale);
86            }
87        }
88    }
89
90    locales.into_iter()
91}
92
/// Turns a POSIX locale identifier into its BCP 47 counterpart.
///
/// Everything from the first `.` (character encoding) or `@` (modifier)
/// onwards is dropped, whichever comes first, and every underscore (`_`) in
/// the remaining part becomes a hyphen (`-`).
///
/// An identifier that contains none of `.`, `@` or `_` is returned unchanged,
/// so input that is already in BCP 47 form passes through as is.
///
/// Useful links:
/// - [The Open Group Base Specifications Issue 8 - 7. Locale](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap07.html)
/// - [The Open Group Base Specifications Issue 8 - 8. Environment Variables](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap08.html)
/// - [BCP 47 specification](https://www.ietf.org/rfc/bcp/bcp47.html)
///
/// # Examples
///
/// ```ignore
/// assert_eq!(posix_to_bcp47("en-US"), "en-US"); // already BCP 47
/// assert_eq!(posix_to_bcp47("en_US"), "en-US");
/// assert_eq!(posix_to_bcp47("ru_RU.UTF-8"), "ru-RU");
/// assert_eq!(posix_to_bcp47("fr_FR@dict"), "fr-FR");
/// assert_eq!(posix_to_bcp47("de_DE.UTF-8@euro"), "de-DE");
/// ```
///
/// # TODO
///
/// 1. Implement POSIX to BCP 47 modifier conversion (see https://github.com/1Password/sys-locale/issues/32).
/// 2. Optimize to avoid creating a new buffer (see https://github.com/1Password/sys-locale/pull/33).
fn posix_to_bcp47(locale: &str) -> String {
    let starts_suffix = |c: char| matches!(c, '.' | '@');

    // `split` always yields at least one piece, so the fallback is never
    // taken; the first piece is the part before any encoding/modifier.
    let language_and_territory = locale.split(starts_suffix).next().unwrap_or_default();

    language_and_territory.replace('_', "-")
}
136
// Unit tests for the POSIX -> BCP 47 conversion and for the environment
// lookup order, run against an in-memory environment instead of the real one.
#[cfg(test)]
mod tests {
    use super::{EnvAccess, _get, posix_to_bcp47, LANG, LANGUAGE, LC_ALL, LC_MESSAGES};
    use std::{
        collections::HashMap,
        ffi::{OsStr, OsString},
    };

    /// In-memory stand-in for the process environment: variable name -> value.
    type MockEnv = HashMap<OsString, String>;
    impl EnvAccess for MockEnv {
        fn get(&self, key: impl AsRef<OsStr>) -> Option<String> {
            // Resolves to the inherent `HashMap::get` (inherent methods take
            // precedence over trait methods), so this does not recurse.
            self.get(key.as_ref()).cloned()
        }
    }

    // The same locale written in BCP 47 form and in the POSIX variants
    // (plain, with encoding, with modifier, with both).
    const BCP_47: &str = "fr-FR";
    const POSIX: &str = "fr_FR";
    const POSIX_ENC: &str = "fr_FR.UTF-8";
    const POSIX_MOD: &str = "fr_FR@euro";
    const POSIX_ENC_MOD: &str = "fr_FR.UTF-8@euro";

    /// Every spelling of the locale must convert to the same BCP 47 tag.
    #[test]
    fn parse_identifier() {
        assert_eq!(posix_to_bcp47(BCP_47), BCP_47);
        assert_eq!(posix_to_bcp47(POSIX), BCP_47);
        assert_eq!(posix_to_bcp47(POSIX_ENC), BCP_47);
        assert_eq!(posix_to_bcp47(POSIX_MOD), BCP_47);
        assert_eq!(posix_to_bcp47(POSIX_ENC_MOD), BCP_47);
    }

    /// Table-driven check of `_get`: variable precedence, de-duplication and
    /// conversion of the values.
    #[test]
    fn env_get() {
        /// Sets all four variables in `env` (overwriting whatever an earlier
        /// case left behind) and asserts that `_get` yields exactly
        /// `expected`, in order.
        fn case(
            env: &mut MockEnv,
            language: impl Into<String>,
            lc_all: impl Into<String>,
            lc_messages: impl Into<String>,
            lang: impl Into<String>,
            expected: impl IntoIterator<Item = impl Into<String>>,
        ) {
            env.insert(LANGUAGE.into(), language.into());
            env.insert(LC_ALL.into(), lc_all.into());
            env.insert(LC_MESSAGES.into(), lc_messages.into());
            env.insert(LANG.into(), lang.into());
            assert!(_get(env).eq(expected.into_iter().map(|s| s.into())));
        }

        // No variables set at all: nothing is returned.
        let mut env = MockEnv::new();
        assert_eq!(_get(&env).next(), None);

        // Empty
        case(&mut env, "", "", "", "", &[] as &[String]);

        // Constants
        case(
            &mut env,
            POSIX_ENC_MOD,
            POSIX_ENC,
            POSIX_MOD,
            POSIX,
            [BCP_47],
        );

        // Only one variable
        case(&mut env, "en_US", "", "", "", ["en-US"]);
        case(&mut env, "", "en_US", "", "", ["en-US"]);
        case(&mut env, "", "", "en_US", "", ["en-US"]);
        case(&mut env, "", "", "", "en_US", ["en-US"]);

        // Duplicates
        case(&mut env, "en_US", "en_US", "en_US", "en_US", ["en-US"]);
        case(
            &mut env,
            "en_US",
            "en_US",
            "ru_RU",
            "en_US",
            ["en-US", "ru-RU"],
        );
        case(
            &mut env,
            "en_US",
            "ru_RU",
            "ru_RU",
            "en_US",
            ["en-US", "ru-RU"],
        );
        case(
            &mut env,
            "en_US",
            "es_ES",
            "ru_RU",
            "en_US",
            ["en-US", "es-ES", "ru-RU"],
        );
        case(
            &mut env,
            "en_US:ru_RU:es_ES:en_US",
            "es_ES",
            "ru_RU",
            "en_US",
            ["en-US", "ru-RU", "es-ES"],
        );

        // Duplicates with different case
        // (de-duplication is case-sensitive, so every spelling is kept)
        case(
            &mut env,
            "en_US:fr_fr",
            "EN_US",
            "fR_Fr",
            "En_US",
            ["en-US", "fr-fr", "EN-US", "fR-Fr", "En-US"],
        );

        // More complicated cases
        case(
            &mut env,
            "ru_RU:ru:en_US:en",
            "ru_RU.UTF-8",
            "ru_RU.UTF-8",
            "ru_RU.UTF-8",
            ["ru-RU", "ru", "en-US", "en"],
        );
        case(
            &mut env,
            "fr_FR.UTF-8@euro:fr_FR.UTF-8:fr_FR:fr:en_US.UTF-8:en_US:en",
            "es_ES.UTF-8@euro",
            "fr_FR.UTF-8@euro",
            "fr_FR.UTF-8@euro",
            ["fr-FR", "fr", "en-US", "en", "es-ES"],
        );
        case(
            &mut env,
            "",
            "es_ES.UTF-8@euro",
            "fr_FR.UTF-8@euro",
            "fr_FR.UTF-8@euro",
            ["es-ES", "fr-FR"],
        );
        case(
            &mut env,
            "fr_FR@euro",
            "fr_FR.UTF-8",
            "en_US.UTF-8",
            "en_US.UTF-8@dict",
            ["fr-FR", "en-US"],
        );

        // Already BCP 47
        case(&mut env, BCP_47, BCP_47, BCP_47, POSIX, [BCP_47]);
        case(
            &mut env,
            "fr-FR",
            "es-ES",
            "de-DE",
            "en-US",
            ["fr-FR", "es-ES", "de-DE", "en-US"],
        );
    }
}