toml_edit/parser/
trivia.rs

1use std::ops::RangeInclusive;
2
3use winnow::combinator::alt;
4use winnow::combinator::eof;
5use winnow::combinator::opt;
6use winnow::combinator::repeat;
7use winnow::combinator::terminated;
8use winnow::prelude::*;
9use winnow::token::one_of;
10use winnow::token::take_while;
11
12use crate::parser::prelude::*;
13
/// Reinterprets `bytes` as a `&str`, skipping UTF-8 validation in release builds.
///
/// When `debug_assertions` are enabled the bytes are checked with
/// [`std::str::from_utf8`] and a violation panics with `safety_justification` as the
/// message, so a broken invariant surfaces while testing instead of silently becoming
/// undefined behavior.
///
/// # Safety
///
/// `bytes` must be valid UTF-8. `safety_justification` should state why the caller
/// knows that to be true.
pub(crate) unsafe fn from_utf8_unchecked<'b>(
    bytes: &'b [u8],
    safety_justification: &'static str,
) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        // SAFETY: the caller guarantees that `bytes` is valid UTF-8 (see `# Safety`).
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}
25
// wschar = ( %x20 /              ; Space
//            %x09 )              ; Horizontal tab
/// Byte set for the `wschar` rule: space and horizontal tab.
pub(crate) const WSCHAR: (u8, u8) = (0x20, 0x09);
29
30// ws = *wschar
31pub(crate) fn ws<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
32    take_while(0.., WSCHAR)
33        .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` filters out on-ASCII") })
34        .parse_next(input)
35}
36
// non-ascii = %x80-D7FF / %xE000-10FFFF
// - ASCII is 0xxxxxxx
// - First byte for UTF-8 is 11xxxxxx
// - Subsequent UTF-8 bytes are 10xxxxxx
/// Byte-level stand-in for the `non-ascii` rule: every byte with the high bit set.
pub(crate) const NON_ASCII: RangeInclusive<u8> = 0x80..=u8::MAX;
42
43// non-eol = %x09 / %x20-7E / non-ascii
44pub(crate) const NON_EOL: (u8, RangeInclusive<u8>, RangeInclusive<u8>) =
45    (0x09, 0x20..=0x7E, NON_ASCII);
46
// comment-start-symbol = %x23 ; #
/// The byte that opens a comment (`#`).
pub(crate) const COMMENT_START_SYMBOL: u8 = 0x23;
49
50// comment = comment-start-symbol *non-eol
51pub(crate) fn comment<'i>(input: &mut Input<'i>) -> PResult<&'i [u8]> {
52    (COMMENT_START_SYMBOL, take_while(0.., NON_EOL))
53        .recognize()
54        .parse_next(input)
55}
56
57// newline = ( %x0A /              ; LF
58//             %x0D.0A )           ; CRLF
59pub(crate) fn newline(input: &mut Input<'_>) -> PResult<u8> {
60    alt((
61        one_of(LF).value(b'\n'),
62        (one_of(CR), one_of(LF)).value(b'\n'),
63    ))
64    .parse_next(input)
65}
/// Line feed (`\n`).
pub(crate) const LF: u8 = 0x0A;
/// Carriage return (`\r`).
pub(crate) const CR: u8 = 0x0D;
68
69// ws-newline       = *( wschar / newline )
70pub(crate) fn ws_newline<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
71    repeat(
72        0..,
73        alt((newline.value(&b"\n"[..]), take_while(1.., WSCHAR))),
74    )
75    .map(|()| ())
76    .recognize()
77    .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII") })
78    .parse_next(input)
79}
80
81// ws-newlines      = newline *( wschar / newline )
82pub(crate) fn ws_newlines<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
83    (newline, ws_newline)
84        .recognize()
85        .map(|b| unsafe {
86            from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII")
87        })
88        .parse_next(input)
89}
90
91// note: this rule is not present in the original grammar
92// ws-comment-newline = *( ws-newline-nonempty / comment )
93pub(crate) fn ws_comment_newline<'i>(input: &mut Input<'i>) -> PResult<&'i [u8]> {
94    repeat(
95        0..,
96        alt((
97            repeat(
98                1..,
99                alt((take_while(1.., WSCHAR), newline.value(&b"\n"[..]))),
100            )
101            .map(|()| ()),
102            comment.value(()),
103        )),
104    )
105    .map(|()| ())
106    .recognize()
107    .parse_next(input)
108}
109
110// note: this rule is not present in the original grammar
111// line-ending = newline / eof
112pub(crate) fn line_ending<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
113    alt((newline.value("\n"), eof.value(""))).parse_next(input)
114}
115
116// note: this rule is not present in the original grammar
117// line-trailing = ws [comment] skip-line-ending
118pub(crate) fn line_trailing(input: &mut Input<'_>) -> PResult<std::ops::Range<usize>> {
119    terminated((ws, opt(comment)).span(), line_ending).parse_next(input)
120}
121
#[cfg(test)]
mod test {
    use super::*;

    /// `ws_comment_newline` must accept every sample and consume it completely.
    #[test]
    fn trivia() {
        // Samples: empty input, plain whitespace, bare newlines, and comments
        // interleaved with blank lines and trailing indentation.
        let inputs = [
            "",
            r#" "#,
            r#"
"#,
            r#"
# comment

# comment2


"#,
            r#"
        "#,
            r#"# comment
# comment2


   "#,
        ];
        for input in inputs {
            dbg!(input);
            let parsed = ws_comment_newline.parse(new_input(input));
            assert!(parsed.is_ok(), "{:?}", parsed);
            // The recognized slice has to be the entire input, byte for byte.
            assert_eq!(parsed.unwrap(), input.as_bytes());
        }
    }
}