toml_edit/parser/
strings.rs

1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
5use winnow::combinator::alt;
6use winnow::combinator::cut_err;
7use winnow::combinator::delimited;
8use winnow::combinator::empty;
9use winnow::combinator::fail;
10use winnow::combinator::opt;
11use winnow::combinator::peek;
12use winnow::combinator::preceded;
13use winnow::combinator::repeat;
14use winnow::combinator::terminated;
15use winnow::combinator::trace;
16use winnow::prelude::*;
17use winnow::stream::Stream;
18use winnow::token::any;
19use winnow::token::none_of;
20use winnow::token::one_of;
21use winnow::token::take_while;
22
23use crate::parser::error::CustomError;
24use crate::parser::numbers::HEXDIG;
25use crate::parser::prelude::*;
26use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
27
28// ;; String
29
30// string = ml-basic-string / basic-string / ml-literal-string / literal-string
31pub(crate) fn string<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
32    trace(
33        "string",
34        alt((
35            ml_basic_string,
36            basic_string,
37            ml_literal_string,
38            literal_string.map(Cow::Borrowed),
39        )),
40    )
41    .parse_next(input)
42}
43
44// ;; Basic String
45
46// basic-string = quotation-mark *basic-char quotation-mark
47pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
48    trace("basic-string", |input: &mut Input<'i>| {
49        let _ = one_of(QUOTATION_MARK).parse_next(input)?;
50
51        let mut c = Cow::Borrowed("");
52        if let Some(ci) = opt(basic_chars).parse_next(input)? {
53            c = ci;
54        }
55        while let Some(ci) = opt(basic_chars).parse_next(input)? {
56            c.to_mut().push_str(&ci);
57        }
58
59        let _ = cut_err(one_of(QUOTATION_MARK))
60            .context(StrContext::Label("basic string"))
61            .parse_next(input)?;
62
63        Ok(c)
64    })
65    .parse_next(input)
66}
67
// quotation-mark = %x22            ; "
/// The byte that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
70
71// basic-char = basic-unescaped / escaped
72fn basic_chars<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
73    alt((
74        // Deviate from the official grammar by batching the unescaped chars so we build a string a
75        // chunk at a time, rather than a `char` at a time.
76        take_while(1.., BASIC_UNESCAPED)
77            .try_map(std::str::from_utf8)
78            .map(Cow::Borrowed),
79        escaped.map(|c| Cow::Owned(String::from(c))),
80    ))
81    .parse_next(input)
82}
83
84// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
85pub(crate) const BASIC_UNESCAPED: (
86    (u8, u8),
87    u8,
88    RangeInclusive<u8>,
89    RangeInclusive<u8>,
90    RangeInclusive<u8>,
91) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
92
93// escaped = escape escape-seq-char
94fn escaped(input: &mut Input<'_>) -> ModalResult<char> {
95    preceded(ESCAPE, escape_seq_char).parse_next(input)
96}
97
// escape = %x5C                    ; \
/// The byte that introduces an escape sequence in basic strings.
pub(crate) const ESCAPE: u8 = 0x5C;
100
101// escape-seq-char =  %x22         ; "    quotation mark  U+0022
102// escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
103// escape-seq-char =/ %x62         ; b    backspace       U+0008
104// escape-seq-char =/ %x66         ; f    form feed       U+000C
105// escape-seq-char =/ %x6E         ; n    line feed       U+000A
106// escape-seq-char =/ %x72         ; r    carriage return U+000D
107// escape-seq-char =/ %x74         ; t    tab             U+0009
108// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
109// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
110fn escape_seq_char(input: &mut Input<'_>) -> ModalResult<char> {
111    dispatch! {any;
112        b'b' => empty.value('\u{8}'),
113        b'f' => empty.value('\u{c}'),
114        b'n' => empty.value('\n'),
115        b'r' => empty.value('\r'),
116        b't' => empty.value('\t'),
117        b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
118        b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
119        b'\\' => empty.value('\\'),
120        b'"' => empty.value('"'),
121        _ => {
122            cut_err(fail::<_, char, _>)
123            .context(StrContext::Label("escape sequence"))
124            .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
125            .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
126            .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
127            .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
128            .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
129            .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
130            .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
131            .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
132            .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
133        }
134    }
135    .parse_next(input)
136}
137
138pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> ModalResult<char> {
139    take_while(0..=N, HEXDIG)
140        .verify(|b: &[u8]| b.len() == N)
141        .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
142        .verify_map(|s| u32::from_str_radix(s, 16).ok())
143        .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
144        .parse_next(input)
145}
146
147// ;; Multiline Basic String
148
149// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
150//                   ml-basic-string-delim
151fn ml_basic_string<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
152    trace(
153        "ml-basic-string",
154        delimited(
155            ML_BASIC_STRING_DELIM,
156            preceded(opt(newline), cut_err(ml_basic_body))
157                .context(StrContext::Label("multiline basic string")),
158            cut_err(ML_BASIC_STRING_DELIM).context(StrContext::Label("multiline basic string")),
159        ),
160    )
161    .parse_next(input)
162}
163
// ml-basic-string-delim = 3quotation-mark
/// The three-quote delimiter that opens and closes a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = &[b'"'; 3];
166
167// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
168fn ml_basic_body<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
169    let mut c = Cow::Borrowed("");
170    if let Some(ci) = opt(mlb_content).parse_next(input)? {
171        c = ci;
172    }
173    while let Some(ci) = opt(mlb_content).parse_next(input)? {
174        c.to_mut().push_str(&ci);
175    }
176
177    while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
178        if let Some(ci) = opt(mlb_content).parse_next(input)? {
179            c.to_mut().push_str(qi);
180            c.to_mut().push_str(&ci);
181            while let Some(ci) = opt(mlb_content).parse_next(input)? {
182                c.to_mut().push_str(&ci);
183            }
184        } else {
185            break;
186        }
187    }
188
189    if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? {
190        c.to_mut().push_str(qi);
191    }
192
193    Ok(c)
194}
195
196// mlb-content = mlb-char / newline / mlb-escaped-nl
197// mlb-char = mlb-unescaped / escaped
198fn mlb_content<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
199    alt((
200        // Deviate from the official grammar by batching the unescaped chars so we build a string a
201        // chunk at a time, rather than a `char` at a time.
202        take_while(1.., MLB_UNESCAPED)
203            .try_map(std::str::from_utf8)
204            .map(Cow::Borrowed),
205        // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
206        mlb_escaped_nl.map(|_| Cow::Borrowed("")),
207        escaped.map(|c| Cow::Owned(String::from(c))),
208        newline.map(|_| Cow::Borrowed("\n")),
209    ))
210    .parse_next(input)
211}
212
213// mlb-quotes = 1*2quotation-mark
214fn mlb_quotes<'i>(
215    mut term: impl ModalParser<Input<'i>, (), ContextError>,
216) -> impl ModalParser<Input<'i>, &'i str, ContextError> {
217    move |input: &mut Input<'i>| {
218        let start = input.checkpoint();
219        let res = terminated(b"\"\"", peek(term.by_ref()))
220            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
221            .parse_next(input);
222
223        match res {
224            Err(winnow::error::ErrMode::Backtrack(_)) => {
225                input.reset(&start);
226                terminated(b"\"", peek(term.by_ref()))
227                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
228                    .parse_next(input)
229            }
230            res => res,
231        }
232    }
233}
234
235// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
236pub(crate) const MLB_UNESCAPED: (
237    (u8, u8),
238    u8,
239    RangeInclusive<u8>,
240    RangeInclusive<u8>,
241    RangeInclusive<u8>,
242) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
243
244// mlb-escaped-nl = escape ws newline *( wschar / newline
245// When the last non-whitespace character on a line is a \,
246// it will be trimmed along with all whitespace
247// (including newlines) up to the next non-whitespace
248// character or closing delimiter.
249fn mlb_escaped_nl(input: &mut Input<'_>) -> ModalResult<()> {
250    repeat(1.., (ESCAPE, ws, ws_newlines))
251        .map(|()| ())
252        .value(())
253        .parse_next(input)
254}
255
256// ;; Literal String
257
258// literal-string = apostrophe *literal-char apostrophe
259pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> ModalResult<&'i str> {
260    trace(
261        "literal-string",
262        delimited(
263            APOSTROPHE,
264            cut_err(take_while(0.., LITERAL_CHAR)),
265            cut_err(APOSTROPHE),
266        )
267        .try_map(std::str::from_utf8)
268        .context(StrContext::Label("literal string")),
269    )
270    .parse_next(input)
271}
272
// apostrophe = %x27 ; ' apostrophe
/// The byte that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = 0x27;
275
276// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
277pub(crate) const LITERAL_CHAR: (
278    u8,
279    RangeInclusive<u8>,
280    RangeInclusive<u8>,
281    RangeInclusive<u8>,
282) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
283
284// ;; Multiline Literal String
285
286// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
287//                     ml-literal-string-delim
288fn ml_literal_string<'i>(input: &mut Input<'i>) -> ModalResult<Cow<'i, str>> {
289    trace(
290        "ml-literal-string",
291        delimited(
292            (ML_LITERAL_STRING_DELIM, opt(newline)),
293            cut_err(ml_literal_body.map(|t| {
294                if t.contains("\r\n") {
295                    Cow::Owned(t.replace("\r\n", "\n"))
296                } else {
297                    Cow::Borrowed(t)
298                }
299            }))
300            .context(StrContext::Label("multiline literal string")),
301            cut_err(ML_LITERAL_STRING_DELIM).context(StrContext::Label("multiline literal string")),
302        ),
303    )
304    .parse_next(input)
305}
306
// ml-literal-string-delim = 3apostrophe
/// The three-apostrophe delimiter that opens and closes a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
309
310// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
311fn ml_literal_body<'i>(input: &mut Input<'i>) -> ModalResult<&'i str> {
312    (
313        repeat(0.., mll_content).map(|()| ()),
314        repeat(
315            0..,
316            (
317                mll_quotes(none_of(APOSTROPHE).value(())),
318                repeat(1.., mll_content).map(|()| ()),
319            ),
320        )
321        .map(|()| ()),
322        opt(mll_quotes(ML_LITERAL_STRING_DELIM.void())),
323    )
324        .take()
325        .try_map(std::str::from_utf8)
326        .parse_next(input)
327}
328
329// mll-content = mll-char / newline
330fn mll_content(input: &mut Input<'_>) -> ModalResult<u8> {
331    alt((one_of(MLL_CHAR), newline.value(b'\n'))).parse_next(input)
332}
333
334// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
335const MLL_CHAR: (
336    u8,
337    RangeInclusive<u8>,
338    RangeInclusive<u8>,
339    RangeInclusive<u8>,
340) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
341
342// mll-quotes = 1*2apostrophe
343fn mll_quotes<'i>(
344    mut term: impl ModalParser<Input<'i>, (), ContextError>,
345) -> impl ModalParser<Input<'i>, &'i str, ContextError> {
346    move |input: &mut Input<'i>| {
347        let start = input.checkpoint();
348        let res = terminated(b"''", peek(term.by_ref()))
349            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
350            .parse_next(input);
351
352        match res {
353            Err(winnow::error::ErrMode::Backtrack(_)) => {
354                input.reset(&start);
355                terminated(b"'", peek(term.by_ref()))
356                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
357                    .parse_next(input)
358            }
359            res => res,
360        }
361    }
362}
363
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    /// Asserts that `string` parses the whole of `input` and yields `expected`.
    #[track_caller]
    fn assert_parses(input: &str, expected: &str) {
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    #[test]
    fn basic_string() {
        // Covers escaped quotes, `\t`, `\n`, a 4-digit and an 8-digit unicode escape.
        assert_parses(
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#,
            "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}",
        );
    }

    #[test]
    fn ml_basic_string() {
        let valid = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];
        for (input, expected) in valid.iter().copied() {
            assert_parses(input, expected);
        }

        let invalid = [r#""""  """#, r#""""  \""""#];
        for input in invalid.iter() {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    #[test]
    fn ml_basic_string_escape_ws() {
        let sentences = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in sentences.iter() {
            assert_parses(input, "The quick brown fox jumps over the lazy dog.");
        }

        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in empties.iter() {
            assert_parses(input, "");
        }
    }

    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];
        for input in inputs.iter() {
            // The expected value is the input minus its surrounding apostrophes.
            assert_parses(input, &input[1..input.len() - 1]);
        }
    }

    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in inputs.iter() {
            // The expected value is the input minus its three-apostrophe delimiters.
            assert_parses(input, &input[3..input.len() - 3]);
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the newline that follows it.
        assert_parses(input, &input[4..input.len() - 3]);
    }
}