Make Tokenizer emit spans

This commit is contained in:
John-John Tedro 2018-05-01 13:43:02 +02:00
parent 90b5e83a83
commit 6d7c89169b
2 changed files with 110 additions and 75 deletions

View file

@ -1025,7 +1025,7 @@ impl<'a> Deserializer<'a> {
} }
fn table_key(&mut self) -> Result<Cow<'a, str>, Error> { fn table_key(&mut self) -> Result<Cow<'a, str>, Error> {
self.tokens.table_key().map_err(|e| self.token_error(e)) self.tokens.table_key().map(|t| t.1).map_err(|e| self.token_error(e))
} }
fn eat_whitespace(&mut self) -> Result<(), Error> { fn eat_whitespace(&mut self) -> Result<(), Error> {
@ -1049,11 +1049,11 @@ impl<'a> Deserializer<'a> {
} }
fn next(&mut self) -> Result<Option<Token<'a>>, Error> { fn next(&mut self) -> Result<Option<Token<'a>>, Error> {
self.tokens.next().map_err(|e| self.token_error(e)) self.tokens.next().map(|t| t.map(|t| t.1)).map_err(|e| self.token_error(e))
} }
fn peek(&mut self) -> Result<Option<Token<'a>>, Error> { fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
self.tokens.peek().map_err(|e| self.token_error(e)) self.tokens.peek().map(|t| t.map(|t| t.1)).map_err(|e| self.token_error(e))
} }
fn eof(&self) -> Error { fn eof(&self) -> Error {
@ -1281,7 +1281,7 @@ impl<'a> Header<'a> {
if self.first || self.tokens.eat(Token::Period)? { if self.first || self.tokens.eat(Token::Period)? {
self.first = false; self.first = false;
self.tokens.eat_whitespace()?; self.tokens.eat_whitespace()?;
self.tokens.table_key().map(Some) self.tokens.table_key().map(|t| t.1).map(Some)
} else { } else {
self.tokens.expect(Token::RightBracket)?; self.tokens.expect(Token::RightBracket)?;
if self.array { if self.array {

View file

@ -5,6 +5,21 @@ use std::string;
use self::Token::*; use self::Token::*;
/// Byte range in the input that a single token occupies.
///
/// The range is half-open: `start` is the offset of the token's first byte
/// and `end` is the offset one past its last byte, so `&input[start..end]`
/// yields the token's source text.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Span {
    /// Offset of the first byte of the token (inclusive).
    pub start: usize,
    /// Offset one past the last byte of the token (exclusive).
    pub end: usize,
}
impl From<Span> for (usize, usize) {
fn from(Span { start, end }: Span) -> (usize, usize) {
(start, end)
}
}
#[derive(Eq, PartialEq, Debug)] #[derive(Eq, PartialEq, Debug)]
pub enum Token<'a> { pub enum Token<'a> {
Whitespace(&'a str), Whitespace(&'a str),
@ -69,38 +84,42 @@ impl<'a> Tokenizer<'a> {
t t
} }
pub fn next(&mut self) -> Result<Option<Token<'a>>, Error> { pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
let token = match self.chars.next() { let (start, token) = match self.one() {
Some((_, '\n')) => Newline, Some((start, '\n')) => (start, Newline),
Some((start, ' ')) => self.whitespace_token(start), Some((start, ' ')) => (start, self.whitespace_token(start)),
Some((start, '\t')) => self.whitespace_token(start), Some((start, '\t')) => (start, self.whitespace_token(start)),
Some((start, '#')) => self.comment_token(start), Some((start, '#')) => (start, self.comment_token(start)),
Some((_, '=')) => Equals, Some((start, '=')) => (start, Equals),
Some((_, '.')) => Period, Some((start, '.')) => (start, Period),
Some((_, ',')) => Comma, Some((start, ',')) => (start, Comma),
Some((_, ':')) => Colon, Some((start, ':')) => (start, Colon),
Some((_, '+')) => Plus, Some((start, '+')) => (start, Plus),
Some((_, '{')) => LeftBrace, Some((start, '{')) => (start, LeftBrace),
Some((_, '}')) => RightBrace, Some((start, '}')) => (start, RightBrace),
Some((_, '[')) => LeftBracket, Some((start, '[')) => (start, LeftBracket),
Some((_, ']')) => RightBracket, Some((start, ']')) => (start, RightBracket),
Some((start, '\'')) => return self.literal_string(start).map(Some), Some((start, '\'')) => return self.literal_string(start)
Some((start, '"')) => return self.basic_string(start).map(Some), .map(|t| Some((self.step_span(start), t))),
Some((start, ch)) if is_keylike(ch) => self.keylike(start), Some((start, '"')) => return self.basic_string(start)
.map(|t| Some((self.step_span(start), t))),
Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
Some((start, ch)) => return Err(Error::Unexpected(start, ch)), Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
None => return Ok(None), None => return Ok(None),
}; };
Ok(Some(token))
let span = self.step_span(start);
Ok(Some((span, token)))
} }
pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> { pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
self.clone().next() self.clone().next()
} }
pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> { pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
match self.peek()? { match self.peek()? {
Some(ref found) if expected == *found => {} Some((_, ref found)) if expected == *found => {}
Some(_) => return Ok(false), Some(_) => return Ok(false),
None => return Ok(false), None => return Ok(false),
} }
@ -111,7 +130,7 @@ impl<'a> Tokenizer<'a> {
pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> { pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
let current = self.current(); let current = self.current();
match self.next()? { match self.next()? {
Some(found) => { Some((_, found)) => {
if expected == found { if expected == found {
Ok(()) Ok(())
} else { } else {
@ -132,21 +151,21 @@ impl<'a> Tokenizer<'a> {
} }
} }
pub fn table_key(&mut self) -> Result<Cow<'a, str>, Error> { pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
let current = self.current(); let current = self.current();
match self.next()? { match self.next()? {
Some(Token::Keylike(k)) => Ok(k.into()), Some((span, Token::Keylike(k))) => Ok((span, k.into())),
Some(Token::String { src, val }) => { Some((span, Token::String { src, val })) => {
let offset = self.substr_offset(src); let offset = self.substr_offset(src);
if val == "" { if val == "" {
return Err(Error::EmptyTableKey(offset)) return Err(Error::EmptyTableKey(offset))
} }
match src.find('\n') { match src.find('\n') {
None => Ok(val), None => Ok((span, val)),
Some(i) => Err(Error::NewlineInTableKey(offset + i)), Some(i) => Err(Error::NewlineInTableKey(offset + i)),
} }
} }
Some(other) => { Some((_, other)) => {
Err(Error::Wanted { Err(Error::Wanted {
at: current, at: current,
expected: "a table key", expected: "a table key",
@ -182,8 +201,8 @@ impl<'a> Tokenizer<'a> {
let current = self.current(); let current = self.current();
match self.next()? { match self.next()? {
None | None |
Some(Token::Newline) => Ok(()), Some((_, Token::Newline)) => Ok(()),
Some(other) => { Some((_, other)) => {
Err(Error::Wanted { Err(Error::Wanted {
at: current, at: current,
expected: "newline", expected: "newline",
@ -195,7 +214,7 @@ impl<'a> Tokenizer<'a> {
pub fn skip_to_newline(&mut self) { pub fn skip_to_newline(&mut self) {
loop { loop {
match self.chars.next() { match self.one() {
Some((_, '\n')) | Some((_, '\n')) |
None => break, None => break,
_ => {} _ => {}
@ -206,7 +225,7 @@ impl<'a> Tokenizer<'a> {
fn eatc(&mut self, ch: char) -> bool { fn eatc(&mut self, ch: char) -> bool {
match self.chars.clone().next() { match self.chars.clone().next() {
Some((_, ch2)) if ch == ch2 => { Some((_, ch2)) if ch == ch2 => {
self.chars.next(); self.one();
true true
} }
_ => false, _ => false,
@ -233,7 +252,7 @@ impl<'a> Tokenizer<'a> {
if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') { if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
break break
} }
self.chars.next(); self.one();
} }
Comment(&self.input[start..self.current()]) Comment(&self.input[start..self.current()])
} }
@ -260,7 +279,7 @@ impl<'a> Tokenizer<'a> {
let mut n = 0; let mut n = 0;
'outer: loop { 'outer: loop {
n += 1; n += 1;
match self.chars.next() { match self.one() {
Some((i, '\n')) => { Some((i, '\n')) => {
if multiline { if multiline {
if self.input.as_bytes()[i] == b'\r' { if self.input.as_bytes()[i] == b'\r' {
@ -352,7 +371,7 @@ impl<'a> Tokenizer<'a> {
fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> { fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
let mut val = 0; let mut val = 0;
for _ in 0..len { for _ in 0..len {
match self.chars.next() { match self.one() {
Some((_, ch)) if '0' <= ch && ch <= '9' => { Some((_, ch)) if '0' <= ch && ch <= '9' => {
val = val * 16 + (ch as u32 - '0' as u32); val = val * 16 + (ch as u32 - '0' as u32);
} }
@ -370,11 +389,11 @@ impl<'a> Tokenizer<'a> {
} }
fn keylike(&mut self, start: usize) -> Token<'a> { fn keylike(&mut self, start: usize) -> Token<'a> {
while let Some((_, ch)) = self.chars.clone().next() { while let Some((_, ch)) = self.peek_one() {
if !is_keylike(ch) { if !is_keylike(ch) {
break break
} }
self.chars.next(); self.one();
} }
Keylike(&self.input[start..self.current()]) Keylike(&self.input[start..self.current()])
} }
@ -386,6 +405,22 @@ impl<'a> Tokenizer<'a> {
assert!(a <= b); assert!(a <= b);
b - a b - a
} }
/// Build the span of the token that began at byte offset `start`.
///
/// The span ends at the offset of the next unconsumed character, or at
/// `self.input.len()` once the input is exhausted. It therefore covers
/// everything consumed since `start`, which may be more than one character
/// (whitespace runs, comments, strings, keys). Call this only after the
/// token has been fully consumed.
fn step_span(&self, start: usize) -> Span {
    // Look ahead on a clone of the iterator so the real cursor is untouched.
    let end = match self.chars.clone().next() {
        Some((offset, _)) => offset,
        None => self.input.len(),
    };
    Span { start, end }
}
/// Return the next `(offset, char)` pair without consuming it.
///
/// Works on a clone of the character iterator, so the tokenizer's position
/// is left unchanged; this is why a shared borrow is sufficient. Returns
/// `None` when no characters remain.
fn peek_one(&self) -> Option<(usize, char)> {
    let mut lookahead = self.chars.clone();
    lookahead.next()
}
/// Consume the next character from the input.
///
/// Advances the tokenizer by one item and returns it as an
/// `(offset, char)` pair, or `None` when the input is exhausted.
pub fn one(&mut self) -> Option<(usize, char)> {
    Iterator::next(&mut self.chars)
}
} }
impl<'a> Iterator for CrlfFold<'a> { impl<'a> Iterator for CrlfFold<'a> {
@ -475,7 +510,7 @@ mod tests {
fn literal_strings() { fn literal_strings() {
fn t(input: &str, val: &str) { fn t(input: &str, val: &str) {
let mut t = Tokenizer::new(input); let mut t = Tokenizer::new(input);
let token = t.next().unwrap().unwrap(); let (_, token) = t.next().unwrap().unwrap();
assert_eq!(token, Token::String { assert_eq!(token, Token::String {
src: input, src: input,
val: Cow::Borrowed(val), val: Cow::Borrowed(val),
@ -497,7 +532,7 @@ mod tests {
fn basic_strings() { fn basic_strings() {
fn t(input: &str, val: &str) { fn t(input: &str, val: &str) {
let mut t = Tokenizer::new(input); let mut t = Tokenizer::new(input);
let token = t.next().unwrap().unwrap(); let (_, token) = t.next().unwrap().unwrap();
assert_eq!(token, Token::String { assert_eq!(token, Token::String {
src: input, src: input,
val: Cow::Borrowed(val), val: Cow::Borrowed(val),
@ -538,7 +573,7 @@ mod tests {
fn keylike() { fn keylike() {
fn t(input: &str) { fn t(input: &str) {
let mut t = Tokenizer::new(input); let mut t = Tokenizer::new(input);
let token = t.next().unwrap().unwrap(); let (_, token) = t.next().unwrap().unwrap();
assert_eq!(token, Token::Keylike(input)); assert_eq!(token, Token::Keylike(input));
assert!(t.next().unwrap().is_none()); assert!(t.next().unwrap().is_none());
} }
@ -554,11 +589,11 @@ mod tests {
#[test] #[test]
fn all() { fn all() {
fn t(input: &str, expected: &[Token]) { fn t(input: &str, expected: &[((usize, usize), Token, &str)]) {
let mut tokens = Tokenizer::new(input); let mut tokens = Tokenizer::new(input);
let mut actual = Vec::new(); let mut actual: Vec<((usize, usize), Token, &str)> = Vec::new();
while let Some(token) = tokens.next().unwrap() { while let Some((span, token)) = tokens.next().unwrap() {
actual.push(token); actual.push((span.into(), token, &input[span.start..span.end]));
} }
for (a, b) in actual.iter().zip(expected) { for (a, b) in actual.iter().zip(expected) {
assert_eq!(a, b); assert_eq!(a, b);
@ -567,37 +602,37 @@ mod tests {
} }
t(" a ", &[ t(" a ", &[
Token::Whitespace(" "), ((0, 1), Token::Whitespace(" "), " "),
Token::Keylike("a"), ((1, 2), Token::Keylike("a"), "a"),
Token::Whitespace(" "), ((2, 3), Token::Whitespace(" "), " "),
]); ]);
t(" a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ", &[ t(" a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ", &[
Token::Whitespace(" "), ((0, 1), Token::Whitespace(" "), " "),
Token::Keylike("a"), ((1, 2), Token::Keylike("a"), "a"),
Token::Whitespace("\t "), ((2, 4), Token::Whitespace("\t "), "\t "),
Token::LeftBracket, ((4, 5), Token::LeftBracket, "["),
Token::LeftBracket, ((5, 6), Token::LeftBracket, "["),
Token::RightBracket, ((6, 7), Token::RightBracket, "]"),
Token::RightBracket, ((7, 8), Token::RightBracket, "]"),
Token::Whitespace(" \t "), ((8, 11), Token::Whitespace(" \t "), " \t "),
Token::LeftBracket, ((11, 12), Token::LeftBracket, "["),
Token::RightBracket, ((12, 13), Token::RightBracket, "]"),
Token::Whitespace(" "), ((13, 14), Token::Whitespace(" "), " "),
Token::LeftBrace, ((14, 15), Token::LeftBrace, "{"),
Token::RightBrace, ((15, 16), Token::RightBrace, "}"),
Token::Whitespace(" "), ((16, 17), Token::Whitespace(" "), " "),
Token::Comma, ((17, 18), Token::Comma, ","),
Token::Whitespace(" "), ((18, 19), Token::Whitespace(" "), " "),
Token::Period, ((19, 20), Token::Period, "."),
Token::Whitespace(" "), ((20, 21), Token::Whitespace(" "), " "),
Token::Equals, ((21, 22), Token::Equals, "="),
Token::Newline, ((22, 23), Token::Newline, "\n"),
Token::Comment("# foo "), ((23, 29), Token::Comment("# foo "), "# foo "),
Token::Newline, ((29, 31), Token::Newline, "\r\n"),
Token::Comment("#foo "), ((31, 36), Token::Comment("#foo "), "#foo "),
Token::Newline, ((36, 37), Token::Newline, "\n"),
Token::Whitespace(" "), ((37, 38), Token::Whitespace(" "), " "),
]); ]);
} }