// holey-bytes/hblang/src/lexer.rs
use std::simd::cmp::SimdPartialEq;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2024-05-09 16:41:59 -05:00
pub struct Token {
pub kind: TokenKind,
pub start: u32,
pub end: u32,
}
impl Token {
pub fn range(&self) -> std::ops::Range<usize> {
self.start as usize..self.end as usize
}
}
macro_rules! gen_token_kind {
($(
#[$atts:meta])*
$vis:vis enum $name:ident {
#[patterns] $(
$pattern:ident,
)*
#[keywords] $(
$keyword:ident = $keyword_lit:literal,
)*
#[punkt] $(
$punkt:ident = $punkt_lit:literal,
)*
#[ops] $(
2024-05-15 03:37:39 -05:00
#[$prec:ident] $(
$op:ident = $op_lit:literal $(=> $assign:ident)?,
2024-05-11 15:22:08 -05:00
)*
)*
}
) => {
impl std::fmt::Display for $name {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let s = match *self {
$( Self::$pattern => concat!('<', stringify!($pattern), '>'), )*
$( Self::$keyword => stringify!($keyword_lit), )*
$( Self::$punkt => stringify!($punkt_lit), )*
2024-05-15 03:37:39 -05:00
$($( Self::$op => $op_lit,
$(Self::$assign => concat!($op_lit, "="),)?)*)*
2024-05-11 15:22:08 -05:00
};
f.write_str(s)
}
}
2024-05-09 16:41:59 -05:00
2024-05-11 15:22:08 -05:00
impl $name {
#[inline(always)]
pub fn precedence(&self) -> Option<u8> {
Some(match self {
2024-05-15 03:37:39 -05:00
$($(Self::$op => ${ignore($prec)} ${index(1)},
$(Self::$assign => 0,)?)*)*
2024-05-11 15:22:08 -05:00
_ => return None,
2024-05-15 03:37:39 -05:00
} + 1)
2024-05-11 15:22:08 -05:00
}
#[inline(always)]
fn from_ident(ident: &[u8]) -> Self {
match ident {
$($keyword_lit => Self::$keyword,)*
_ => Self::Ident,
}
}
2024-05-15 03:37:39 -05:00
pub fn assign_op(&self) -> Option<Self> {
Some(match self {
$($($(Self::$assign => Self::$op,)?)*)*
_ => return None,
})
}
2024-05-11 15:22:08 -05:00
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
$vis enum $name {
$( $pattern, )*
$( $keyword, )*
$( $punkt, )*
2024-05-15 03:37:39 -05:00
$($( $op, $($assign,)? )*)*
2024-05-11 15:22:08 -05:00
}
};
2024-05-10 15:54:12 -05:00
}
// Token table for the language. Only plain `//` comments are used inside the
// invocation: doc comments would be turned into attributes, which the macro's
// variant patterns do not accept.
gen_token_kind! {
    pub enum TokenKind {
        // Tokens without a fixed spelling.
        #[patterns]
        CtIdent,
        Ident,
        Number,
        Eof,
        Error,
        // NOTE(review): spelled `Driective` (sic). The lexer refers to the
        // variant by this name, so any rename has to be done crate-wide.
        Driective,
        String,
        // Reserved words; matched against whole identifiers.
        #[keywords]
        Return = b"return",
        If = b"if",
        Else = b"else",
        Loop = b"loop",
        Break = b"break",
        Continue = b"continue",
        Fn = b"fn",
        Struct = b"struct",
        True = b"true",
        // Punctuation that is never a binary operator.
        #[punkt]
        LParen = "(",
        RParen = ")",
        LBrace = "{",
        RBrace = "}",
        Semi = ";",
        Colon = ":",
        Comma = ",",
        Dot = ".",
        Ctor = ".{",
        Tupl = ".(",
        // Binary operators. Each `#[prec]` opens a new precedence group; later
        // groups get a higher precedence number. `=> X` names the
        // compound-assignment form of the operator.
        #[ops]
        #[prec]
        Decl = ":=",
        Assign = "=",
        #[prec]
        Or = "||",
        #[prec]
        And = "&&",
        #[prec]
        Bor = "|" => BorAss,
        #[prec]
        Xor = "^" => XorAss,
        #[prec]
        Band = "&" => BandAss,
        #[prec]
        Eq = "==",
        Ne = "!=",
        #[prec]
        Le = "<=",
        Ge = ">=",
        Lt = "<",
        Gt = ">",
        #[prec]
        Shl = "<<" => ShlAss,
        Shr = ">>" => ShrAss,
        #[prec]
        Add = "+" => AddAss,
        Sub = "-" => SubAss,
        #[prec]
        Mul = "*" => MulAss,
        Div = "/" => DivAss,
        Mod = "%" => ModAss,
    }
}
pub struct Lexer<'a> {
pos: u32,
bytes: &'a [u8],
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str) -> Self {
2024-06-24 10:26:00 -05:00
Self::restore(input, 0)
}
pub fn restore(input: &'a str, pos: u32) -> Self {
2024-05-09 16:41:59 -05:00
Self {
2024-06-24 10:26:00 -05:00
pos,
2024-05-09 16:41:59 -05:00
bytes: input.as_bytes(),
}
}
2024-05-12 04:52:58 -05:00
pub fn slice(&self, tok: std::ops::Range<usize>) -> &'a str {
unsafe { std::str::from_utf8_unchecked(&self.bytes[tok]) }
2024-05-09 16:41:59 -05:00
}
fn peek(&self) -> Option<u8> {
self.bytes.get(self.pos as usize).copied()
}
fn advance(&mut self) -> Option<u8> {
let c = self.peek()?;
self.pos += 1;
Some(c)
}
pub fn next(&mut self) -> Token {
2024-05-15 03:37:39 -05:00
use TokenKind as T;
loop {
let mut start = self.pos;
let Some(c) = self.advance() else {
return Token {
kind: T::Eof,
start,
end: self.pos,
};
};
2024-06-01 13:30:07 -05:00
let advance_ident = |s: &mut Self| {
while let Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_') = s.peek() {
s.advance();
}
};
2024-05-15 03:37:39 -05:00
let kind = match c {
b'\n' | b'\r' | b'\t' | b' ' => continue,
b'0'..=b'9' => {
while let Some(b'0'..=b'9') = self.peek() {
self.advance();
}
T::Number
}
2024-06-01 13:30:07 -05:00
b'@' => {
start += 1;
advance_ident(self);
T::Driective
}
b'$' => {
start += 1;
advance_ident(self);
T::CtIdent
}
b'a'..=b'z' | b'A'..=b'Z' | b'_' => {
advance_ident(self);
let ident = &self.bytes[start as usize..self.pos as usize];
T::from_ident(ident)
2024-05-15 03:37:39 -05:00
}
2024-05-17 12:53:59 -05:00
b'"' => {
while let Some(c) = self.advance() {
match c {
b'"' => break,
b'\\' => _ = self.advance(),
_ => {}
}
}
T::String
}
2024-05-15 03:37:39 -05:00
b':' if self.advance_if(b'=') => T::Decl,
b':' => T::Colon,
b',' => T::Comma,
b'.' if self.advance_if(b'{') => T::Ctor,
b'.' if self.advance_if(b'(') => T::Tupl,
b'.' => T::Dot,
b';' => T::Semi,
b'!' if self.advance_if(b'=') => T::Ne,
b'=' if self.advance_if(b'=') => T::Eq,
b'=' => T::Assign,
b'<' if self.advance_if(b'=') => T::Le,
b'<' if self.advance_if(b'<') => match self.advance_if(b'=') {
true => T::ShlAss,
false => T::Shl,
},
b'<' => T::Lt,
b'>' if self.advance_if(b'=') => T::Ge,
b'>' if self.advance_if(b'>') => match self.advance_if(b'=') {
true => T::ShrAss,
false => T::Shr,
},
b'>' => T::Gt,
b'+' if self.advance_if(b'=') => T::AddAss,
b'+' => T::Add,
b'-' if self.advance_if(b'=') => T::SubAss,
b'-' => T::Sub,
b'*' if self.advance_if(b'=') => T::MulAss,
b'*' => T::Mul,
b'/' if self.advance_if(b'=') => T::DivAss,
b'/' => T::Div,
b'%' if self.advance_if(b'=') => T::ModAss,
b'%' => T::Mod,
b'&' if self.advance_if(b'=') => T::BandAss,
b'&' if self.advance_if(b'&') => T::And,
b'&' => T::Band,
b'^' if self.advance_if(b'=') => T::XorAss,
b'^' => T::Xor,
b'|' if self.advance_if(b'=') => T::BorAss,
b'|' if self.advance_if(b'|') => T::Or,
b'|' => T::Bor,
b'(' => T::LParen,
b')' => T::RParen,
b'{' => T::LBrace,
b'}' => T::RBrace,
_ => T::Error,
};
return Token {
kind,
start,
end: self.pos,
};
}
2024-05-09 16:41:59 -05:00
}
fn advance_if(&mut self, arg: u8) -> bool {
if self.peek() == Some(arg) {
self.advance();
true
} else {
false
}
}
2024-05-12 16:19:45 -05:00
pub fn line_col(&self, pos: u32) -> (usize, usize) {
line_col(self.bytes, pos)
2024-05-09 16:41:59 -05:00
}
}
/// Converts a byte offset into a 1-based `(line, column)` pair.
///
/// Lines are separated by `\n`; the column is counted in bytes from the start
/// of the line. An offset past the end of `bytes` is clamped to the end
/// instead of panicking, so diagnostics code can call this with any position.
pub fn line_col(bytes: &[u8], pos: u32) -> (usize, usize) {
    let end = (pos as usize).min(bytes.len());
    let prefix = &bytes[..end];

    // Every newline before `pos` starts a new line.
    let line = prefix.iter().filter(|&&b| b == b'\n').count() + 1;
    // Bytes between the last newline (or the start) and `pos`.
    let col = prefix.iter().rev().take_while(|&&b| b != b'\n').count() + 1;

    (line, col)
}
pub struct LineMap {
lines: Box<[u8]>,
}
2024-05-09 16:41:59 -05:00
2024-05-19 11:20:42 -05:00
impl LineMap {
pub fn line_col(&self, mut pos: u32) -> (usize, usize) {
let mut line = 1;
2024-05-09 16:41:59 -05:00
2024-05-19 11:20:42 -05:00
let mut iter = self.lines.iter().copied();
2024-06-24 10:26:00 -05:00
loop {
2024-05-19 11:20:42 -05:00
let mut acc = 0;
2024-06-24 10:26:00 -05:00
let mut idx = 0;
loop {
let len = iter.next().unwrap();
acc |= ((len & 0x7F) as u32) << (7 * idx);
idx += 1;
if len & 0x80 == 0 {
break;
}
2024-05-19 11:20:42 -05:00
}
if pos < acc {
break;
}
pos = pos.saturating_sub(acc);
line += 1;
}
(line, pos as usize + 1)
}
pub fn new(input: &str) -> Self {
let bytes = input.as_bytes();
let (start, simd_mid, end) = bytes.as_simd::<16>();
let query = std::simd::u8x16::splat(b'\n');
let nl_count = start.iter().map(|&b| (b == b'\n') as usize).sum::<usize>()
+ simd_mid
.iter()
.map(|s| s.simd_eq(query).to_bitmask().count_ones())
.sum::<u32>() as usize
+ end.iter().map(|&b| (b == b'\n') as usize).sum::<usize>();
let mut lines = Vec::with_capacity(nl_count);
let mut last_nl = 0;
let handle_rem = |offset: usize, bytes: &[u8], last_nl: &mut usize, lines: &mut Vec<u8>| {
bytes
.iter()
.copied()
.enumerate()
.filter_map(|(i, b)| (b == b'\n').then_some(i + offset))
.for_each(|i| {
lines.push((i - *last_nl + 1) as u8);
*last_nl = i + 1;
});
};
handle_rem(0, start, &mut last_nl, &mut lines);
for (i, simd) in simd_mid.iter().enumerate() {
let mask = simd.simd_eq(query);
let mut mask = mask.to_bitmask();
while mask != 0 {
let idx = mask.trailing_zeros() as usize + i * 16 + start.len();
let mut len = idx - last_nl + 1;
while len >= 0x80 {
2024-06-24 10:26:00 -05:00
lines.push(0x80 | (len & 0x7F) as u8);
2024-05-19 11:20:42 -05:00
len >>= 7;
2024-05-09 16:41:59 -05:00
}
2024-05-19 11:20:42 -05:00
lines.push(len as u8);
last_nl = idx + 1;
mask &= mask - 1;
}
}
2024-05-09 16:41:59 -05:00
2024-05-19 11:20:42 -05:00
handle_rem(bytes.len() - end.len(), end, &mut last_nl, &mut lines);
Self {
lines: Box::from(lines),
2024-05-09 16:41:59 -05:00
}
}
}
#[cfg(test)]
mod test {
    /// Cross-checks `LineMap::line_col` against a straightforward line-by-line
    /// search for every byte offset of the crate README.
    #[test]
    fn test_smh() {
        let example = include_str!("../README.md");
        let nlines = super::LineMap::new(example);

        /// Reference implementation: walks the lines, subtracting each line's
        /// length (plus one for its terminator) while the offset still
        /// reaches past it.
        fn slow_nline_search(text: &str, pos: usize) -> (usize, usize) {
            let mut remaining = pos;
            let mut line = 1;
            for l in text.lines() {
                let Some(rest) = remaining.checked_sub(l.len() + 1) else {
                    break;
                };
                remaining = rest;
                line += 1;
            }
            (line, remaining + 1)
        }

        for i in 0..example.len() {
            assert_eq!(slow_nline_search(example, i), nlines.line_col(i as _));
        }
    }
}