toml/tokens.rs

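//! A low-level tokenizer for TOML source text: it turns an input `&str` into a
//! stream of spanned `Token`s, folding CRLF line endings and unescaping basic
//! strings along the way.
//!
//! A minimal usage sketch (marked `ignore` since it assumes this module is
//! already in scope exactly as written):
//!
//! ```ignore
//! let mut t = Tokenizer::new("key = \"value\"\n");
//! while let Some((span, token)) = t.next().unwrap() {
//!     println!("{:?} at {}..{}", token, span.start, span.end);
//! }
//! ```
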
use std::borrow::Cow;
use std::char;
use std::str;
use std::string;
use std::string::String as StdString;

use self::Token::*;

/// A span, designating a range of bytes where a token is located.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: usize,
    /// The end of the range (exclusive).
    pub end: usize,
}

impl From<Span> for (usize, usize) {
    fn from(Span { start, end }: Span) -> (usize, usize) {
        (start, end)
    }
}

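/// A single lexical token. `String` carries both the raw source slice (`src`)
/// and its unescaped value (`val`).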
#[derive(Eq, PartialEq, Debug)]
pub enum Token<'a> {
    Whitespace(&'a str),
    Newline,
    Comment(&'a str),

    Equals,
    Period,
    Comma,
    Colon,
    Plus,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,

    Keylike(&'a str),
    String {
        src: &'a str,
        val: Cow<'a, str>,
        multiline: bool,
    },
}

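/// Errors that can occur while tokenizing; each variant records the byte
/// offset at which the problem was detected.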
#[derive(Eq, PartialEq, Debug)]
pub enum Error {
    InvalidCharInString(usize, char),
    InvalidEscape(usize, char),
    InvalidHexEscape(usize, char),
    InvalidEscapeValue(usize, u32),
    NewlineInString(usize),
    Unexpected(usize, char),
    UnterminatedString(usize),
    NewlineInTableKey(usize),
    MultilineStringKey(usize),
    Wanted {
        at: usize,
        expected: &'static str,
        found: &'static str,
    },
}

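/// The tokenizer itself: the original input plus a CRLF-folding cursor over
/// its characters.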
#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: CrlfFold<'a>,
}

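/// Character iterator that folds a `"\r\n"` pair into a single `'\n'`
/// (reported at the `'\r'` index); a lone `'\r'` is passed through unchanged.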
#[derive(Clone)]
struct CrlfFold<'a> {
    chars: str::CharIndices<'a>,
}

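/// A lazily-owned string value: it stays a borrowed slice of the input
/// (tracked by its start offset) until an escape or normalization forces an
/// owned copy, at which point `push` starts appending characters.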
#[derive(Debug)]
enum MaybeString {
    NotEscaped(usize),
    Owned(string::String),
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Tokenizer<'a> {
        let mut t = Tokenizer {
            input,
            chars: CrlfFold {
                chars: input.char_indices(),
            },
        };
        // Eat utf-8 BOM
        t.eatc('\u{feff}');
        t
    }

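    /// Lex the next token, returning it together with its span, or `Ok(None)`
    /// at end of input.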
    pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
        let (start, token) = match self.one() {
            Some((start, '\n')) => (start, Newline),
            Some((start, ' ')) => (start, self.whitespace_token(start)),
            Some((start, '\t')) => (start, self.whitespace_token(start)),
            Some((start, '#')) => (start, self.comment_token(start)),
            Some((start, '=')) => (start, Equals),
            Some((start, '.')) => (start, Period),
            Some((start, ',')) => (start, Comma),
            Some((start, ':')) => (start, Colon),
            Some((start, '+')) => (start, Plus),
            Some((start, '{')) => (start, LeftBrace),
            Some((start, '}')) => (start, RightBrace),
            Some((start, '[')) => (start, LeftBracket),
            Some((start, ']')) => (start, RightBracket),
            Some((start, '\'')) => {
                return self
                    .literal_string(start)
                    .map(|t| Some((self.step_span(start), t)))
            }
            Some((start, '"')) => {
                return self
                    .basic_string(start)
                    .map(|t| Some((self.step_span(start), t)))
            }
            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),

            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
            None => return Ok(None),
        };

        let span = self.step_span(start);
        Ok(Some((span, token)))
    }

    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
        self.clone().next()
    }

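    /// Consume the expected token if it is the next one, returning whether it
    /// was consumed.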
    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
        self.eat_spanned(expected).map(|s| s.is_some())
    }

    /// Eat a value, returning its span if it was consumed.
    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
        let span = match self.peek()? {
            Some((span, ref found)) if expected == *found => span,
            Some(_) => return Ok(None),
            None => return Ok(None),
        };

        drop(self.next());
        Ok(Some(span))
    }

    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
        // ignore span
        let _ = self.expect_spanned(expected)?;
        Ok(())
    }

    /// Expect the given token returning its span.
    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
        let current = self.current();
        match self.next()? {
            Some((span, found)) => {
                if expected == found {
                    Ok(span)
                } else {
                    Err(Error::Wanted {
                        at: current,
                        expected: expected.describe(),
                        found: found.describe(),
                    })
                }
            }
            None => Err(Error::Wanted {
                at: self.input.len(),
                expected: expected.describe(),
                found: "eof",
            }),
        }
    }

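    /// Parse a table key: either a bare (keylike) token or a single-line
    /// string, returning its span and unescaped value.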
    pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
        let current = self.current();
        match self.next()? {
            Some((span, Token::Keylike(k))) => Ok((span, k.into())),
            Some((
                span,
                Token::String {
                    src,
                    val,
                    multiline,
                },
            )) => {
                let offset = self.substr_offset(src);
                if multiline {
                    return Err(Error::MultilineStringKey(offset));
                }
                match src.find('\n') {
                    None => Ok((span, val)),
                    Some(i) => Err(Error::NewlineInTableKey(offset + i)),
                }
            }
            Some((_, other)) => Err(Error::Wanted {
                at: current,
                expected: "a table key",
                found: other.describe(),
            }),
            None => Err(Error::Wanted {
                at: self.input.len(),
                expected: "a table key",
                found: "eof",
            }),
        }
    }

    pub fn eat_whitespace(&mut self) -> Result<(), Error> {
        while self.eatc(' ') || self.eatc('\t') {
            // ...
        }
        Ok(())
    }

    pub fn eat_comment(&mut self) -> Result<bool, Error> {
        if !self.eatc('#') {
            return Ok(false);
        }
        drop(self.comment_token(0));
        self.eat_newline_or_eof().map(|()| true)
    }

    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
        let current = self.current();
        match self.next()? {
            None | Some((_, Token::Newline)) => Ok(()),
            Some((_, other)) => Err(Error::Wanted {
                at: current,
                expected: "newline",
                found: other.describe(),
            }),
        }
    }

    pub fn skip_to_newline(&mut self) {
        loop {
            match self.one() {
                Some((_, '\n')) | None => break,
                _ => {}
            }
        }
    }

    fn eatc(&mut self, ch: char) -> bool {
        match self.chars.clone().next() {
            Some((_, ch2)) if ch == ch2 => {
                self.one();
                true
            }
            _ => false,
        }
    }

    pub fn current(&mut self) -> usize {
        self.chars
            .clone()
            .next()
            .map(|i| i.0)
            .unwrap_or_else(|| self.input.len())
    }

    pub fn input(&self) -> &'a str {
        self.input
    }

    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
        while self.eatc(' ') || self.eatc('\t') {
            // ...
        }
        Whitespace(&self.input[start..self.current()])
    }

    fn comment_token(&mut self, start: usize) -> Token<'a> {
        while let Some((_, ch)) = self.chars.clone().next() {
            if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
                break;
            }
            self.one();
        }
        Comment(&self.input[start..self.current()])
    }

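    /// Shared scanner for literal and basic strings. The opening delimiter has
    /// already been consumed by `next`; this detects multiline strings, trims
    /// the newline immediately after the opening `'''`/`"""`, handles the
    /// closing delimiter, and defers every other character to the `new_ch`
    /// callback (which validates it and applies escapes for basic strings).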
    #[allow(clippy::type_complexity)]
    fn read_string(
        &mut self,
        delim: char,
        start: usize,
        new_ch: &mut dyn FnMut(
            &mut Tokenizer<'_>,
            &mut MaybeString,
            bool,
            usize,
            char,
        ) -> Result<(), Error>,
    ) -> Result<Token<'a>, Error> {
        let mut multiline = false;
        if self.eatc(delim) {
            if self.eatc(delim) {
                multiline = true;
            } else {
                return Ok(String {
                    src: &self.input[start..start + 2],
                    val: Cow::Borrowed(""),
                    multiline: false,
                });
            }
        }
        let mut val = MaybeString::NotEscaped(self.current());
        let mut n = 0;
        'outer: loop {
            n += 1;
            match self.one() {
                Some((i, '\n')) => {
                    if multiline {
                        if self.input.as_bytes()[i] == b'\r' {
                            val.to_owned(&self.input[..i]);
                        }
                        if n == 1 {
                            val = MaybeString::NotEscaped(self.current());
                        } else {
                            val.push('\n');
                        }
                        continue;
                    } else {
                        return Err(Error::NewlineInString(i));
                    }
                }
                Some((mut i, ch)) if ch == delim => {
                    if multiline {
                        if !self.eatc(delim) {
                            val.push(delim);
                            continue 'outer;
                        }
                        if !self.eatc(delim) {
                            val.push(delim);
                            val.push(delim);
                            continue 'outer;
                        }
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }
                    }
                    return Ok(String {
                        src: &self.input[start..self.current()],
                        val: val.into_cow(&self.input[..i]),
                        multiline,
                    });
                }
                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
                None => return Err(Error::UnterminatedString(start)),
            }
        }
    }

    fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
            if ch == '\u{09}' || (('\u{20}'..='\u{10ffff}').contains(&ch) && ch != '\u{7f}') {
                val.push(ch);
                Ok(())
            } else {
                Err(Error::InvalidCharInString(i, ch))
            }
        })
    }

    fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
            '\\' => {
                val.to_owned(&me.input[..i]);
                match me.chars.next() {
                    Some((_, '"')) => val.push('"'),
                    Some((_, '\\')) => val.push('\\'),
                    Some((_, 'b')) => val.push('\u{8}'),
                    Some((_, 'f')) => val.push('\u{c}'),
                    Some((_, 'n')) => val.push('\n'),
                    Some((_, 'r')) => val.push('\r'),
                    Some((_, 't')) => val.push('\t'),
                    Some((i, c @ 'u')) | Some((i, c @ 'U')) => {
                        let len = if c == 'u' { 4 } else { 8 };
                        val.push(me.hex(start, i, len)?);
                    }
                    Some((i, c @ ' ')) | Some((i, c @ '\t')) | Some((i, c @ '\n')) if multi => {
                        if c != '\n' {
                            while let Some((_, ch)) = me.chars.clone().next() {
                                match ch {
                                    ' ' | '\t' => {
                                        me.chars.next();
                                        continue;
                                    }
                                    '\n' => {
                                        me.chars.next();
                                        break;
                                    }
                                    _ => return Err(Error::InvalidEscape(i, c)),
                                }
                            }
                        }
                        while let Some((_, ch)) = me.chars.clone().next() {
                            match ch {
                                ' ' | '\t' | '\n' => {
                                    me.chars.next();
                                }
                                _ => break,
                            }
                        }
                    }
                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
                    None => return Err(Error::UnterminatedString(start)),
                }
                Ok(())
            }
            ch if ch == '\u{09}' || (('\u{20}'..='\u{10ffff}').contains(&ch) && ch != '\u{7f}') => {
                val.push(ch);
                Ok(())
            }
            _ => Err(Error::InvalidCharInString(i, ch)),
        })
    }

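    /// Parse `len` hex digits of a `\u`/`\U` escape into a `char`, rejecting
    /// non-hex digits and values that are not valid Unicode scalar values.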
    fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
        let mut buf = StdString::with_capacity(len);
        for _ in 0..len {
            match self.one() {
                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => buf.push(ch),
                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
                None => return Err(Error::UnterminatedString(start)),
            }
        }
        let val = u32::from_str_radix(&buf, 16).unwrap();
        match char::from_u32(val) {
            Some(ch) => Ok(ch),
            None => Err(Error::InvalidEscapeValue(i, val)),
        }
    }

    fn keylike(&mut self, start: usize) -> Token<'a> {
        while let Some((_, ch)) = self.peek_one() {
            if !is_keylike(ch) {
                break;
            }
            self.one();
        }
        Keylike(&self.input[start..self.current()])
    }

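    /// Byte offset of `s` within the original input; `s` must be a subslice
    /// borrowed from that input.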
    pub fn substr_offset(&self, s: &'a str) -> usize {
        assert!(s.len() <= self.input.len());
        let a = self.input.as_ptr() as usize;
        let b = s.as_ptr() as usize;
        assert!(a <= b);
        b - a
    }

    /// Calculate the span of a single character.
    fn step_span(&mut self, start: usize) -> Span {
        let end = self
            .peek_one()
            .map(|t| t.0)
            .unwrap_or_else(|| self.input.len());
        Span { start, end }
    }

    /// Peek one char without consuming it.
    fn peek_one(&mut self) -> Option<(usize, char)> {
        self.chars.clone().next()
    }

    /// Take one char.
    pub fn one(&mut self) -> Option<(usize, char)> {
        self.chars.next()
    }
}

impl<'a> Iterator for CrlfFold<'a> {
    type Item = (usize, char);

    fn next(&mut self) -> Option<(usize, char)> {
        self.chars.next().map(|(i, c)| {
            if c == '\r' {
                let mut attempt = self.chars.clone();
                if let Some((_, '\n')) = attempt.next() {
                    self.chars = attempt;
                    return (i, '\n');
                }
            }
            (i, c)
        })
    }
}

impl MaybeString {
    fn push(&mut self, ch: char) {
        match *self {
            MaybeString::NotEscaped(..) => {}
            MaybeString::Owned(ref mut s) => s.push(ch),
        }
    }

    #[allow(clippy::wrong_self_convention)]
    fn to_owned(&mut self, input: &str) {
        match *self {
            MaybeString::NotEscaped(start) => {
                *self = MaybeString::Owned(input[start..].to_owned());
            }
            MaybeString::Owned(..) => {}
        }
    }

    fn into_cow(self, input: &str) -> Cow<'_, str> {
        match self {
            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
            MaybeString::Owned(s) => Cow::Owned(s),
        }
    }
}

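/// Whether `ch` may appear in a bare key: ASCII letters, ASCII digits, `-`,
/// or `_`.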
fn is_keylike(ch: char) -> bool {
    ('A'..='Z').contains(&ch)
        || ('a'..='z').contains(&ch)
        || ('0'..='9').contains(&ch)
        || ch == '-'
        || ch == '_'
}

impl<'a> Token<'a> {
    pub fn describe(&self) -> &'static str {
        match *self {
            Token::Keylike(_) => "an identifier",
            Token::Equals => "an equals",
            Token::Period => "a period",
            Token::Comment(_) => "a comment",
            Token::Newline => "a newline",
            Token::Whitespace(_) => "whitespace",
            Token::Comma => "a comma",
            Token::RightBrace => "a right brace",
            Token::LeftBrace => "a left brace",
            Token::RightBracket => "a right bracket",
            Token::LeftBracket => "a left bracket",
            Token::String { multiline, .. } => {
                if multiline {
                    "a multiline string"
                } else {
                    "a string"
                }
            }
            Token::Colon => "a colon",
            Token::Plus => "a plus",
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{Error, Token, Tokenizer};
    use std::borrow::Cow;

    fn err(input: &str, err: Error) {
        let mut t = Tokenizer::new(input);
        let token = t.next().unwrap_err();
        assert_eq!(token, err);
        assert!(t.next().unwrap().is_none());
    }

    #[test]
    fn literal_strings() {
        fn t(input: &str, val: &str, multiline: bool) {
            let mut t = Tokenizer::new(input);
            let (_, token) = t.next().unwrap().unwrap();
            assert_eq!(
                token,
                Token::String {
                    src: input,
                    val: Cow::Borrowed(val),
                    multiline,
                }
            );
            assert!(t.next().unwrap().is_none());
        }

        t("''", "", false);
        t("''''''", "", true);
        t("'''\n'''", "", true);
        t("'a'", "a", false);
        t("'\"a'", "\"a", false);
        t("''''a'''", "'a", true);
        t("'''\n'a\n'''", "'a\n", true);
        t("'''a\n'a\r\n'''", "a\n'a\n", true);
    }

    #[test]
    fn basic_strings() {
        fn t(input: &str, val: &str, multiline: bool) {
            let mut t = Tokenizer::new(input);
            let (_, token) = t.next().unwrap().unwrap();
            assert_eq!(
                token,
                Token::String {
                    src: input,
                    val: Cow::Borrowed(val),
                    multiline,
                }
            );
            assert!(t.next().unwrap().is_none());
        }

        t(r#""""#, "", false);
        t(r#""""""""#, "", true);
        t(r#""a""#, "a", false);
        t(r#""""a""""#, "a", true);
        t(r#""\t""#, "\t", false);
        t(r#""\u0000""#, "\0", false);
        t(r#""\U00000000""#, "\0", false);
        t(r#""\U000A0000""#, "\u{A0000}", false);
        t(r#""\\t""#, "\\t", false);
        t("\"\t\"", "\t", false);
        t("\"\"\"\n\t\"\"\"", "\t", true);
        t("\"\"\"\\\n\"\"\"", "", true);
        t(
            "\"\"\"\\\n     \t   \t  \\\r\n  \t \n  \t \r\n\"\"\"",
            "",
            true,
        );
        t(r#""\r""#, "\r", false);
        t(r#""\n""#, "\n", false);
        t(r#""\b""#, "\u{8}", false);
        t(r#""a\fa""#, "a\u{c}a", false);
        t(r#""\"a""#, "\"a", false);
        t("\"\"\"\na\"\"\"", "a", true);
        t("\"\"\"\n\"\"\"", "", true);
        t(r#""""a\"""b""""#, "a\"\"\"b", true);
        err(r#""\a"#, Error::InvalidEscape(2, 'a'));
        err("\"\\\n", Error::InvalidEscape(2, '\n'));
        err("\"\\\r\n", Error::InvalidEscape(2, '\n'));
        err("\"\\", Error::UnterminatedString(0));
        err("\"\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err(r#""\U00""#, Error::InvalidHexEscape(5, '"'));
        err(r#""\U00"#, Error::UnterminatedString(0));
        err(r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800));
        err(r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffff_ffff));
    }

    #[test]
    fn keylike() {
        fn t(input: &str) {
            let mut t = Tokenizer::new(input);
            let (_, token) = t.next().unwrap().unwrap();
            assert_eq!(token, Token::Keylike(input));
            assert!(t.next().unwrap().is_none());
        }
        t("foo");
        t("0bar");
        t("bar0");
        t("1234");
        t("a-b");
        t("a_B");
        t("-_-");
        t("___");
    }

    #[test]
    fn all() {
        fn t(input: &str, expected: &[((usize, usize), Token<'_>, &str)]) {
            let mut tokens = Tokenizer::new(input);
            let mut actual: Vec<((usize, usize), Token<'_>, &str)> = Vec::new();
            while let Some((span, token)) = tokens.next().unwrap() {
                actual.push((span.into(), token, &input[span.start..span.end]));
            }
            for (a, b) in actual.iter().zip(expected) {
                assert_eq!(a, b);
            }
            assert_eq!(actual.len(), expected.len());
        }

        t(
            " a ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 3), Token::Whitespace(" "), " "),
            ],
        );

        t(
            " a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 4), Token::Whitespace("\t "), "\t "),
                ((4, 5), Token::LeftBracket, "["),
                ((5, 6), Token::LeftBracket, "["),
                ((6, 7), Token::RightBracket, "]"),
                ((7, 8), Token::RightBracket, "]"),
                ((8, 11), Token::Whitespace(" \t "), " \t "),
                ((11, 12), Token::LeftBracket, "["),
                ((12, 13), Token::RightBracket, "]"),
                ((13, 14), Token::Whitespace(" "), " "),
                ((14, 15), Token::LeftBrace, "{"),
                ((15, 16), Token::RightBrace, "}"),
                ((16, 17), Token::Whitespace(" "), " "),
                ((17, 18), Token::Comma, ","),
                ((18, 19), Token::Whitespace(" "), " "),
                ((19, 20), Token::Period, "."),
                ((20, 21), Token::Whitespace(" "), " "),
                ((21, 22), Token::Equals, "="),
                ((22, 23), Token::Newline, "\n"),
                ((23, 29), Token::Comment("# foo "), "# foo "),
                ((29, 31), Token::Newline, "\r\n"),
                ((31, 36), Token::Comment("#foo "), "#foo "),
                ((36, 37), Token::Newline, "\n"),
                ((37, 38), Token::Whitespace(" "), " "),
            ],
        );
    }

    #[test]
    fn bare_cr_bad() {
        err("\r", Error::Unexpected(0, '\r'));
        err("'\n", Error::NewlineInString(1));
        err("'\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err("'", Error::UnterminatedString(0));
        err("\u{0}", Error::Unexpected(0, '\u{0}'));
    }

    #[test]
    fn bad_comment() {
        let mut t = Tokenizer::new("#\u{0}");
        t.next().unwrap().unwrap();
        assert_eq!(t.next(), Err(Error::Unexpected(1, '\u{0}')));
        assert!(t.next().unwrap().is_none());
    }
}