toml_edit/parser/
strings.rs

1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
5use winnow::combinator::alt;
6use winnow::combinator::cut_err;
7use winnow::combinator::delimited;
8use winnow::combinator::fail;
9use winnow::combinator::opt;
10use winnow::combinator::peek;
11use winnow::combinator::preceded;
12use winnow::combinator::repeat;
13use winnow::combinator::success;
14use winnow::combinator::terminated;
15use winnow::prelude::*;
16use winnow::stream::Stream;
17use winnow::token::any;
18use winnow::token::none_of;
19use winnow::token::one_of;
20use winnow::token::tag;
21use winnow::token::take_while;
22use winnow::trace::trace;
23
24use crate::parser::errors::CustomError;
25use crate::parser::numbers::HEXDIG;
26use crate::parser::prelude::*;
27use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28
29// ;; String
30
31// string = ml-basic-string / basic-string / ml-literal-string / literal-string
32pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
33    trace(
34        "string",
35        alt((
36            ml_basic_string,
37            basic_string,
38            ml_literal_string,
39            literal_string.map(Cow::Borrowed),
40        )),
41    )
42    .parse_next(input)
43}
44
45// ;; Basic String
46
47// basic-string = quotation-mark *basic-char quotation-mark
48pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
49    trace("basic-string", |input: &mut Input<'i>| {
50        let _ = one_of(QUOTATION_MARK).parse_next(input)?;
51
52        let mut c = Cow::Borrowed("");
53        if let Some(ci) = opt(basic_chars).parse_next(input)? {
54            c = ci;
55        }
56        while let Some(ci) = opt(basic_chars).parse_next(input)? {
57            c.to_mut().push_str(&ci);
58        }
59
60        let _ = cut_err(one_of(QUOTATION_MARK))
61            .context(StrContext::Label("basic string"))
62            .parse_next(input)?;
63
64        Ok(c)
65    })
66    .parse_next(input)
67}
68
69// quotation-mark = %x22            ; "
/// The `"` byte that delimits basic strings.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
71
72// basic-char = basic-unescaped / escaped
73fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
74    alt((
75        // Deviate from the official grammar by batching the unescaped chars so we build a string a
76        // chunk at a time, rather than a `char` at a time.
77        take_while(1.., BASIC_UNESCAPED)
78            .try_map(std::str::from_utf8)
79            .map(Cow::Borrowed),
80        escaped.map(|c| Cow::Owned(String::from(c))),
81    ))
82    .parse_next(input)
83}
84
85// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
86pub(crate) const BASIC_UNESCAPED: (
87    (u8, u8),
88    u8,
89    RangeInclusive<u8>,
90    RangeInclusive<u8>,
91    RangeInclusive<u8>,
92) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
93
94// escaped = escape escape-seq-char
95fn escaped(input: &mut Input<'_>) -> PResult<char> {
96    preceded(ESCAPE, escape_seq_char).parse_next(input)
97}
98
99// escape = %x5C                    ; \
/// The `\` byte that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = 0x5C;
101
102// escape-seq-char =  %x22         ; "    quotation mark  U+0022
103// escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
104// escape-seq-char =/ %x62         ; b    backspace       U+0008
105// escape-seq-char =/ %x66         ; f    form feed       U+000C
106// escape-seq-char =/ %x6E         ; n    line feed       U+000A
107// escape-seq-char =/ %x72         ; r    carriage return U+000D
108// escape-seq-char =/ %x74         ; t    tab             U+0009
109// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
110// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
111fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
112    dispatch! {any;
113        b'b' => success('\u{8}'),
114        b'f' => success('\u{c}'),
115        b'n' => success('\n'),
116        b'r' => success('\r'),
117        b't' => success('\t'),
118        b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
119        b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
120        b'\\' => success('\\'),
121        b'"' => success('"'),
122        _ => {
123            cut_err(fail::<_, char, _>)
124            .context(StrContext::Label("escape sequence"))
125            .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
126            .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
127            .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
128            .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
129            .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
130            .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
131            .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
132            .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
133            .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
134        }
135    }
136    .parse_next(input)
137}
138
139pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
140    take_while(0..=N, HEXDIG)
141        .verify(|b: &[u8]| b.len() == N)
142        .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
143        .verify_map(|s| u32::from_str_radix(s, 16).ok())
144        .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
145        .parse_next(input)
146}
147
148// ;; Multiline Basic String
149
150// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
151//                   ml-basic-string-delim
152fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
153    trace(
154        "ml-basic-string",
155        delimited(
156            ML_BASIC_STRING_DELIM,
157            preceded(opt(newline), cut_err(ml_basic_body)),
158            cut_err(ML_BASIC_STRING_DELIM),
159        )
160        .context(StrContext::Label("multiline basic string")),
161    )
162    .parse_next(input)
163}
164
165// ml-basic-string-delim = 3quotation-mark
/// Three consecutive `"` bytes, opening and closing a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = &[b'"'; 3];
167
168// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
169fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
170    let mut c = Cow::Borrowed("");
171    if let Some(ci) = opt(mlb_content).parse_next(input)? {
172        c = ci;
173    }
174    while let Some(ci) = opt(mlb_content).parse_next(input)? {
175        c.to_mut().push_str(&ci);
176    }
177
178    while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
179        if let Some(ci) = opt(mlb_content).parse_next(input)? {
180            c.to_mut().push_str(qi);
181            c.to_mut().push_str(&ci);
182            while let Some(ci) = opt(mlb_content).parse_next(input)? {
183                c.to_mut().push_str(&ci);
184            }
185        } else {
186            break;
187        }
188    }
189
190    if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? {
191        c.to_mut().push_str(qi);
192    }
193
194    Ok(c)
195}
196
197// mlb-content = mlb-char / newline / mlb-escaped-nl
198// mlb-char = mlb-unescaped / escaped
199fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
200    alt((
201        // Deviate from the official grammar by batching the unescaped chars so we build a string a
202        // chunk at a time, rather than a `char` at a time.
203        take_while(1.., MLB_UNESCAPED)
204            .try_map(std::str::from_utf8)
205            .map(Cow::Borrowed),
206        // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
207        mlb_escaped_nl.map(|_| Cow::Borrowed("")),
208        escaped.map(|c| Cow::Owned(String::from(c))),
209        newline.map(|_| Cow::Borrowed("\n")),
210    ))
211    .parse_next(input)
212}
213
214// mlb-quotes = 1*2quotation-mark
215fn mlb_quotes<'i>(
216    mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
217) -> impl Parser<Input<'i>, &'i str, ContextError> {
218    move |input: &mut Input<'i>| {
219        let start = input.checkpoint();
220        let res = terminated(b"\"\"", peek(term.by_ref()))
221            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
222            .parse_next(input);
223
224        match res {
225            Err(winnow::error::ErrMode::Backtrack(_)) => {
226                input.reset(start);
227                terminated(b"\"", peek(term.by_ref()))
228                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
229                    .parse_next(input)
230            }
231            res => res,
232        }
233    }
234}
235
236// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
237pub(crate) const MLB_UNESCAPED: (
238    (u8, u8),
239    u8,
240    RangeInclusive<u8>,
241    RangeInclusive<u8>,
242    RangeInclusive<u8>,
243) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
244
// mlb-escaped-nl = escape ws newline *( wschar / newline )
246// When the last non-whitespace character on a line is a \,
247// it will be trimmed along with all whitespace
248// (including newlines) up to the next non-whitespace
249// character or closing delimiter.
250fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
251    repeat(1.., (ESCAPE, ws, ws_newlines))
252        .map(|()| ())
253        .value(())
254        .parse_next(input)
255}
256
257// ;; Literal String
258
259// literal-string = apostrophe *literal-char apostrophe
260pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
261    trace(
262        "literal-string",
263        delimited(
264            APOSTROPHE,
265            cut_err(take_while(0.., LITERAL_CHAR)),
266            cut_err(APOSTROPHE),
267        )
268        .try_map(std::str::from_utf8)
269        .context(StrContext::Label("literal string")),
270    )
271    .parse_next(input)
272}
273
274// apostrophe = %x27 ; ' apostrophe
/// The `'` byte that delimits literal strings.
pub(crate) const APOSTROPHE: u8 = 0x27;
276
277// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
278pub(crate) const LITERAL_CHAR: (
279    u8,
280    RangeInclusive<u8>,
281    RangeInclusive<u8>,
282    RangeInclusive<u8>,
283) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
284
285// ;; Multiline Literal String
286
287// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
288//                     ml-literal-string-delim
289fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
290    trace(
291        "ml-literal-string",
292        delimited(
293            (ML_LITERAL_STRING_DELIM, opt(newline)),
294            cut_err(ml_literal_body.map(|t| {
295                if t.contains("\r\n") {
296                    Cow::Owned(t.replace("\r\n", "\n"))
297                } else {
298                    Cow::Borrowed(t)
299                }
300            })),
301            cut_err(ML_LITERAL_STRING_DELIM),
302        )
303        .context(StrContext::Label("multiline literal string")),
304    )
305    .parse_next(input)
306}
307
308// ml-literal-string-delim = 3apostrophe
/// Three consecutive `'` bytes, opening and closing a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
310
311// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
312fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
313    (
314        repeat(0.., mll_content).map(|()| ()),
315        repeat(
316            0..,
317            (
318                mll_quotes(none_of(APOSTROPHE).value(())),
319                repeat(1.., mll_content).map(|()| ()),
320            ),
321        )
322        .map(|()| ()),
323        opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
324    )
325        .recognize()
326        .try_map(std::str::from_utf8)
327        .parse_next(input)
328}
329
330// mll-content = mll-char / newline
331fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
332    alt((one_of(MLL_CHAR), newline)).parse_next(input)
333}
334
335// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
336const MLL_CHAR: (
337    u8,
338    RangeInclusive<u8>,
339    RangeInclusive<u8>,
340    RangeInclusive<u8>,
341) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
342
343// mll-quotes = 1*2apostrophe
344fn mll_quotes<'i>(
345    mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
346) -> impl Parser<Input<'i>, &'i str, ContextError> {
347    move |input: &mut Input<'i>| {
348        let start = input.checkpoint();
349        let res = terminated(b"''", peek(term.by_ref()))
350            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
351            .parse_next(input);
352
353        match res {
354            Err(winnow::error::ErrMode::Backtrack(_)) => {
355                input.reset(start);
356                terminated(b"'", peek(term.by_ref()))
357                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
358                    .parse_next(input)
359            }
360            res => res,
361        }
362    }
363}
364
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the top-level `string` parser over all of `input` and checks the decoded value.
    #[track_caller]
    fn assert_decodes(input: &str, expected: &str) {
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are resolved in basic strings.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        assert_decodes(input, expected);
    }

    // Multiline basic strings: leading newline trimmed, quotes next to the delimiter kept,
    // and unterminated inputs rejected.
    #[test]
    fn ml_basic_string() {
        let valid = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];
        for (input, expected) in valid {
            assert_decodes(input, expected);
        }

        let invalid = [r#""""  """#, r#""""  \""""#];
        for input in invalid {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A trailing backslash swallows the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let folded = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in folded {
            assert_decodes(input, "The quick brown fox jumps over the lazy dog.");
        }

        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in empties {
            assert_decodes(input, "");
        }
    }

    // Literal strings are returned verbatim, backslashes included.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];
        for input in inputs {
            // Everything between the two apostrophes.
            let expected = &input[1..input.len() - 1];
            assert_decodes(input, expected);
        }
    }

    // Multiline literal strings are verbatim too, minus the newline right after the opener.
    #[test]
    fn ml_literal_string() {
        let single_line = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in single_line {
            // Everything between the two three-apostrophe delimiters.
            let expected = &input[3..input.len() - 3];
            assert_decodes(input, expected);
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the newline that follows it.
        let expected = &input[4..input.len() - 3];
        assert_decodes(input, expected);
    }
}
478}