Make Tokenizer emit spans

commit 6d7c89169b (parent 90b5e83a83)
@@ -1025,7 +1025,7 @@ impl<'a> Deserializer<'a> {
     }
 
     fn table_key(&mut self) -> Result<Cow<'a, str>, Error> {
-        self.tokens.table_key().map_err(|e| self.token_error(e))
+        self.tokens.table_key().map(|t| t.1).map_err(|e| self.token_error(e))
     }
 
     fn eat_whitespace(&mut self) -> Result<(), Error> {
@@ -1049,11 +1049,11 @@ impl<'a> Deserializer<'a> {
     }
 
     fn next(&mut self) -> Result<Option<Token<'a>>, Error> {
-        self.tokens.next().map_err(|e| self.token_error(e))
+        self.tokens.next().map(|t| t.map(|t| t.1)).map_err(|e| self.token_error(e))
     }
 
     fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
-        self.tokens.peek().map_err(|e| self.token_error(e))
+        self.tokens.peek().map(|t| t.map(|t| t.1)).map_err(|e| self.token_error(e))
     }
 
     fn eof(&self) -> Error {
@@ -1281,7 +1281,7 @@ impl<'a> Header<'a> {
         if self.first || self.tokens.eat(Token::Period)? {
            self.first = false;
            self.tokens.eat_whitespace()?;
-           self.tokens.table_key().map(Some)
+           self.tokens.table_key().map(|t| t.1).map(Some)
        } else {
            self.tokens.expect(Token::RightBracket)?;
            if self.array {
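The double map in the next() and peek() adapters above is easy to misread: the outer map is Result::map over the tokenizer's Result<Option<(Span, Token)>, Error>, and the inner map is Option::map peeling the span off the tuple. A standalone sketch of the same shape (toy types, not this crate's API):

    // Reducing Result<Option<(A, B)>, E> to Result<Option<B>, E> --
    // the same shape as `self.tokens.next().map(|t| t.map(|t| t.1))`.
    fn drop_span<A, B, E>(r: Result<Option<(A, B)>, E>) -> Result<Option<B>, E> {
        r.map(|opt| opt.map(|(_, b)| b))
    }

    fn main() {
        let ok: Result<Option<(u32, &str)>, ()> = Ok(Some((7, "token")));
        assert_eq!(drop_span(ok), Ok(Some("token")));
    }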
src/tokens.rs (177 lines changed)
@@ -5,6 +5,21 @@ use std::string;
 
 use self::Token::*;
 
+/// A span, designating a range of bytes where a token is located.
+#[derive(Eq, PartialEq, Debug, Clone, Copy)]
+pub struct Span {
+    /// The start of the range.
+    pub start: usize,
+    /// The end of the range (exclusive).
+    pub end: usize,
+}
+
+impl From<Span> for (usize, usize) {
+    fn from(Span { start, end }: Span) -> (usize, usize) {
+        (start, end)
+    }
+}
+
 #[derive(Eq, PartialEq, Debug)]
 pub enum Token<'a> {
     Whitespace(&'a str),
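With Span in place, every token can be traced back to the exact bytes it came from. A minimal sketch of driving the tokenizer after this commit (the helper is illustrative, not part of the changeset; it assumes the crate-internal Tokenizer from src/tokens.rs is in scope):

    // Recover each token's source text by slicing the input with its span.
    fn dump(input: &str) {
        let mut tokens = Tokenizer::new(input);
        while let Some((span, token)) = tokens.next().unwrap() {
            println!("{:?} at {}..{} -> {:?}",
                     token, span.start, span.end, &input[span.start..span.end]);
        }
    }

Calling dump("key = \"value\"\n") prints each token alongside its byte range and the slice of the original input it covers.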
@@ -69,38 +84,42 @@ impl<'a> Tokenizer<'a> {
         t
     }
 
-    pub fn next(&mut self) -> Result<Option<Token<'a>>, Error> {
-        let token = match self.chars.next() {
-            Some((_, '\n')) => Newline,
-            Some((start, ' ')) => self.whitespace_token(start),
-            Some((start, '\t')) => self.whitespace_token(start),
-            Some((start, '#')) => self.comment_token(start),
-            Some((_, '=')) => Equals,
-            Some((_, '.')) => Period,
-            Some((_, ',')) => Comma,
-            Some((_, ':')) => Colon,
-            Some((_, '+')) => Plus,
-            Some((_, '{')) => LeftBrace,
-            Some((_, '}')) => RightBrace,
-            Some((_, '[')) => LeftBracket,
-            Some((_, ']')) => RightBracket,
-            Some((start, '\'')) => return self.literal_string(start).map(Some),
-            Some((start, '"')) => return self.basic_string(start).map(Some),
-            Some((start, ch)) if is_keylike(ch) => self.keylike(start),
+    pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
+        let (start, token) = match self.one() {
+            Some((start, '\n')) => (start, Newline),
+            Some((start, ' ')) => (start, self.whitespace_token(start)),
+            Some((start, '\t')) => (start, self.whitespace_token(start)),
+            Some((start, '#')) => (start, self.comment_token(start)),
+            Some((start, '=')) => (start, Equals),
+            Some((start, '.')) => (start, Period),
+            Some((start, ',')) => (start, Comma),
+            Some((start, ':')) => (start, Colon),
+            Some((start, '+')) => (start, Plus),
+            Some((start, '{')) => (start, LeftBrace),
+            Some((start, '}')) => (start, RightBrace),
+            Some((start, '[')) => (start, LeftBracket),
+            Some((start, ']')) => (start, RightBracket),
+            Some((start, '\'')) => return self.literal_string(start)
+                .map(|t| Some((self.step_span(start), t))),
+            Some((start, '"')) => return self.basic_string(start)
+                .map(|t| Some((self.step_span(start), t))),
+            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
 
             Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
             None => return Ok(None),
         };
-        Ok(Some(token))
+
+        let span = self.step_span(start);
+        Ok(Some((span, token)))
     }
 
-    pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
         self.clone().next()
     }
 
     pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
         match self.peek()? {
-            Some(ref found) if expected == *found => {}
+            Some((_, ref found)) if expected == *found => {}
             Some(_) => return Ok(false),
             None => return Ok(false),
         }
@@ -111,7 +130,7 @@ impl<'a> Tokenizer<'a> {
     pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
         let current = self.current();
         match self.next()? {
-            Some(found) => {
+            Some((_, found)) => {
                 if expected == found {
                     Ok(())
                 } else {
@@ -132,21 +151,21 @@ impl<'a> Tokenizer<'a> {
         }
     }
 
-    pub fn table_key(&mut self) -> Result<Cow<'a, str>, Error> {
+    pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
         let current = self.current();
         match self.next()? {
-            Some(Token::Keylike(k)) => Ok(k.into()),
-            Some(Token::String { src, val }) => {
+            Some((span, Token::Keylike(k))) => Ok((span, k.into())),
+            Some((span, Token::String { src, val })) => {
                 let offset = self.substr_offset(src);
                 if val == "" {
                     return Err(Error::EmptyTableKey(offset))
                 }
                 match src.find('\n') {
-                    None => Ok(val),
+                    None => Ok((span, val)),
                     Some(i) => Err(Error::NewlineInTableKey(offset + i)),
                 }
             }
-            Some(other) => {
+            Some((_, other)) => {
                 Err(Error::Wanted {
                     at: current,
                     expected: "a table key",
@@ -182,8 +201,8 @@ impl<'a> Tokenizer<'a> {
         let current = self.current();
         match self.next()? {
             None |
-            Some(Token::Newline) => Ok(()),
-            Some(other) => {
+            Some((_, Token::Newline)) => Ok(()),
+            Some((_, other)) => {
                 Err(Error::Wanted {
                     at: current,
                     expected: "newline",
@@ -195,7 +214,7 @@ impl<'a> Tokenizer<'a> {
 
     pub fn skip_to_newline(&mut self) {
         loop {
-            match self.chars.next() {
+            match self.one() {
                 Some((_, '\n')) |
                 None => break,
                 _ => {}
@@ -206,7 +225,7 @@ impl<'a> Tokenizer<'a> {
     fn eatc(&mut self, ch: char) -> bool {
         match self.chars.clone().next() {
             Some((_, ch2)) if ch == ch2 => {
-                self.chars.next();
+                self.one();
                 true
             }
             _ => false,
@@ -233,7 +252,7 @@ impl<'a> Tokenizer<'a> {
             if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
                 break
             }
-            self.chars.next();
+            self.one();
         }
         Comment(&self.input[start..self.current()])
     }
@@ -260,7 +279,7 @@ impl<'a> Tokenizer<'a> {
         let mut n = 0;
         'outer: loop {
             n += 1;
-            match self.chars.next() {
+            match self.one() {
                 Some((i, '\n')) => {
                     if multiline {
                         if self.input.as_bytes()[i] == b'\r' {
@@ -352,7 +371,7 @@ impl<'a> Tokenizer<'a> {
     fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
         let mut val = 0;
         for _ in 0..len {
-            match self.chars.next() {
+            match self.one() {
                 Some((_, ch)) if '0' <= ch && ch <= '9' => {
                     val = val * 16 + (ch as u32 - '0' as u32);
                 }
@@ -370,11 +389,11 @@ impl<'a> Tokenizer<'a> {
     }
 
     fn keylike(&mut self, start: usize) -> Token<'a> {
-        while let Some((_, ch)) = self.chars.clone().next() {
+        while let Some((_, ch)) = self.peek_one() {
             if !is_keylike(ch) {
                 break
             }
-            self.chars.next();
+            self.one();
         }
         Keylike(&self.input[start..self.current()])
     }
@@ -386,6 +405,22 @@ impl<'a> Tokenizer<'a> {
         assert!(a <= b);
         b - a
     }
+
+    /// Calculate the span of a single character.
+    fn step_span(&mut self, start: usize) -> Span {
+        let end = self.peek_one().map(|t| t.0).unwrap_or_else(|| self.input.len());
+        Span { start: start, end: end }
+    }
+
+    /// Peek one char without consuming it.
+    fn peek_one(&mut self) -> Option<(usize, char)> {
+        self.chars.clone().next()
+    }
+
+    /// Take one char.
+    pub fn one(&mut self) -> Option<(usize, char)> {
+        self.chars.next()
+    }
 }
 
 impl<'a> Iterator for CrlfFold<'a> {
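step_span leans on peek_one: a token's span ends wherever the next unconsumed character starts, falling back to input.len() at end of input. Tokenizing "=x" exercises both cases; a hypothetical unit test in the style of the ones below:

    #[test]
    fn spans_for_single_chars() {
        let mut t = Tokenizer::new("=x");
        // '=' starts at byte 0; peek_one() then sees (1, 'x'), so Equals spans 0..1.
        let (span, _) = t.next().unwrap().unwrap();
        assert_eq!((span.start, span.end), (0, 1));
        // Keylike("x") starts at 1; peek_one() is None, so end = input.len() = 2.
        let (span, _) = t.next().unwrap().unwrap();
        assert_eq!((span.start, span.end), (1, 2));
    }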
@@ -475,7 +510,7 @@ mod tests {
     fn literal_strings() {
         fn t(input: &str, val: &str) {
             let mut t = Tokenizer::new(input);
-            let token = t.next().unwrap().unwrap();
+            let (_, token) = t.next().unwrap().unwrap();
             assert_eq!(token, Token::String {
                 src: input,
                 val: Cow::Borrowed(val),
@@ -497,7 +532,7 @@ mod tests {
     fn basic_strings() {
         fn t(input: &str, val: &str) {
             let mut t = Tokenizer::new(input);
-            let token = t.next().unwrap().unwrap();
+            let (_, token) = t.next().unwrap().unwrap();
             assert_eq!(token, Token::String {
                 src: input,
                 val: Cow::Borrowed(val),
@@ -538,7 +573,7 @@ mod tests {
     fn keylike() {
         fn t(input: &str) {
             let mut t = Tokenizer::new(input);
-            let token = t.next().unwrap().unwrap();
+            let (_, token) = t.next().unwrap().unwrap();
             assert_eq!(token, Token::Keylike(input));
             assert!(t.next().unwrap().is_none());
         }
@@ -554,11 +589,11 @@ mod tests {
 
     #[test]
     fn all() {
-        fn t(input: &str, expected: &[Token]) {
+        fn t(input: &str, expected: &[((usize, usize), Token, &str)]) {
             let mut tokens = Tokenizer::new(input);
-            let mut actual = Vec::new();
-            while let Some(token) = tokens.next().unwrap() {
-                actual.push(token);
+            let mut actual: Vec<((usize, usize), Token, &str)> = Vec::new();
+            while let Some((span, token)) = tokens.next().unwrap() {
+                actual.push((span.into(), token, &input[span.start..span.end]));
             }
             for (a, b) in actual.iter().zip(expected) {
                 assert_eq!(a, b);
@@ -567,37 +602,37 @@ mod tests {
         }
 
         t(" a ", &[
-            Token::Whitespace(" "),
-            Token::Keylike("a"),
-            Token::Whitespace(" "),
+            ((0, 1), Token::Whitespace(" "), " "),
+            ((1, 2), Token::Keylike("a"), "a"),
+            ((2, 3), Token::Whitespace(" "), " "),
         ]);
 
         t(" a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ", &[
-            Token::Whitespace(" "),
-            Token::Keylike("a"),
-            Token::Whitespace("\t "),
-            Token::LeftBracket,
-            Token::LeftBracket,
-            Token::RightBracket,
-            Token::RightBracket,
-            Token::Whitespace(" \t "),
-            Token::LeftBracket,
-            Token::RightBracket,
-            Token::Whitespace(" "),
-            Token::LeftBrace,
-            Token::RightBrace,
-            Token::Whitespace(" "),
-            Token::Comma,
-            Token::Whitespace(" "),
-            Token::Period,
-            Token::Whitespace(" "),
-            Token::Equals,
-            Token::Newline,
-            Token::Comment("# foo "),
-            Token::Newline,
-            Token::Comment("#foo "),
-            Token::Newline,
-            Token::Whitespace(" "),
+            ((0, 1), Token::Whitespace(" "), " "),
+            ((1, 2), Token::Keylike("a"), "a"),
+            ((2, 4), Token::Whitespace("\t "), "\t "),
+            ((4, 5), Token::LeftBracket, "["),
+            ((5, 6), Token::LeftBracket, "["),
+            ((6, 7), Token::RightBracket, "]"),
+            ((7, 8), Token::RightBracket, "]"),
+            ((8, 11), Token::Whitespace(" \t "), " \t "),
+            ((11, 12), Token::LeftBracket, "["),
+            ((12, 13), Token::RightBracket, "]"),
+            ((13, 14), Token::Whitespace(" "), " "),
+            ((14, 15), Token::LeftBrace, "{"),
+            ((15, 16), Token::RightBrace, "}"),
+            ((16, 17), Token::Whitespace(" "), " "),
+            ((17, 18), Token::Comma, ","),
+            ((18, 19), Token::Whitespace(" "), " "),
+            ((19, 20), Token::Period, "."),
+            ((20, 21), Token::Whitespace(" "), " "),
+            ((21, 22), Token::Equals, "="),
+            ((22, 23), Token::Newline, "\n"),
+            ((23, 29), Token::Comment("# foo "), "# foo "),
+            ((29, 31), Token::Newline, "\r\n"),
+            ((31, 36), Token::Comment("#foo "), "#foo "),
+            ((36, 37), Token::Newline, "\n"),
+            ((37, 38), Token::Whitespace(" "), " "),
         ]);
     }
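The natural consumer of these spans is error reporting. A hypothetical helper, not part of this commit, that turns a span's byte offset back into a 0-based line/column pair:

    // Hypothetical: map a byte offset into the input to (line, column).
    fn line_col(input: &str, offset: usize) -> (usize, usize) {
        let prefix = &input[..offset];
        let line = prefix.matches('\n').count();
        // Column counts bytes since the last newline (or the start of input).
        let col = offset - prefix.rfind('\n').map(|i| i + 1).unwrap_or(0);
        (line, col)
    }

    fn main() {
        // '=' on the second line of "a = 1\nb = 2" sits at byte offset 8.
        assert_eq!(line_col("a = 1\nb = 2", 8), (1, 2));
    }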