holey-bytes/lang/src/lexer.rs

603 lines
17 KiB
Rust
Raw Normal View History

2024-06-25 11:39:59 -05:00
/// Builds a 128-bit set with bit `b` set for every byte `b` in `chars`.
///
/// Usable in `const` contexts. Every byte is expected to be below 128,
/// since a larger value overflows the shift.
const fn ascii_mask(chars: &[u8]) -> u128 {
    let mut mask = 0u128;
    let mut rest = chars;
    // Iterators are not available in `const fn`, so peel the slice one byte at a time.
    while let [byte, tail @ ..] = rest {
        mask |= 1u128 << *byte;
        rest = tail;
    }
    mask
}
2024-05-11 09:04:13 -05:00
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2024-05-09 16:41:59 -05:00
pub struct Token {
2024-07-08 00:22:53 -05:00
pub kind: TokenKind,
2024-05-09 16:41:59 -05:00
pub start: u32,
2024-07-08 00:22:53 -05:00
pub end: u32,
2024-05-09 16:41:59 -05:00
}
impl Token {
2024-09-30 12:09:17 -05:00
pub fn range(&self) -> core::ops::Range<usize> {
2024-05-09 16:41:59 -05:00
self.start as usize..self.end as usize
}
}
2024-05-11 15:22:08 -05:00
/// Generates the `Display` impl and the table-driven helpers for a token-kind enum.
///
/// The invocation mirrors an enum declaration whose body is split into sections:
/// `#[patterns]` (token classes), `#[keywords]`, `#[const_keywords]` (keywords that
/// follow `$`), `#[punkt]` (multi-character punctuation) and `#[ops]`, where every
/// nested `#[prec]` marker opens the next precedence level. An operator may name
/// its compound-assignment variant with `=> Variant`.
///
/// The enum itself is not emitted by this macro; only impls for it are.
macro_rules! gen_token_kind {
    (
        $(#[$atts:meta])*
        $vis:vis enum $name:ident {
            #[patterns] $(
                $pattern:ident,
            )*
            #[keywords] $(
                $keyword:ident = $keyword_lit:literal,
            )*
            #[const_keywords] $(
                $const_keyword:ident = $const_keyword_lit:literal,
            )*
            #[punkt] $(
                $punkt:ident = $punkt_lit:literal,
            )*
            #[ops] $(
                #[$prec:ident] $(
                    $op:ident = $op_lit:literal $(=> $assign:ident)?,
                )*
            )*
        }
    ) => {
        impl core::fmt::Display for $name {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                f.write_str(self.name())
            }
        }

        impl $name {
            /// Every operator listed in the ops section (compound-assignment forms excluded).
            pub const OPS: &[Self] = &[$($(Self::$op),*),*];

            /// Human-readable spelling of the token kind.
            ///
            /// Kinds that are not listed in any section are spelled as the single
            /// character equal to their discriminant.
            pub fn name(&self) -> &str {
                // SAFETY: the enum is `#[repr(u8)]`, so `self` can be read as one byte.
                let raw = unsafe { &*(self as *const _ as *const u8) };
                match *self {
                    $( Self::$pattern => concat!('<', stringify!($pattern), '>'), )*
                    $( Self::$keyword => stringify!($keyword_lit), )*
                    $( Self::$const_keyword => concat!('$', $const_keyword_lit), )*
                    $( Self::$punkt => stringify!($punkt_lit), )*
                    $($(
                        Self::$op => $op_lit,
                        $(Self::$assign => concat!($op_lit, "="),)?
                    )*)*
                    // SAFETY: relies on every variant whose discriminant is not ASCII
                    // being handled by one of the arms above, so `raw` is valid UTF-8.
                    _ => unsafe { core::str::from_utf8_unchecked(core::slice::from_ref(raw)) },
                }
            }

            /// Precedence of an operator token, or `None` for anything else.
            ///
            /// The first precedence level and all compound assignments yield 1;
            /// each following level yields one more than the previous.
            #[inline(always)]
            pub fn precedence(&self) -> Option<u8> {
                let level = match self {
                    $($(
                        // Index of the enclosing precedence group.
                        Self::$op => ${ignore($prec)} ${index(1)},
                        $(Self::$assign => 0,)?
                    )*)*
                    _ => return None,
                };
                Some(level + 1)
            }

            // Classifies identifier bytes: a keyword if the spelling matches, `Ident` otherwise.
            // The constants are named after the variants, hence the lint exemption.
            #[allow(non_upper_case_globals)]
            fn from_ident(ident: &[u8]) -> Self {
                $(const $keyword: &[u8] = $keyword_lit.as_bytes();)*
                match ident {
                    $($keyword => Self::$keyword,)*
                    _ => Self::Ident,
                }
            }

            // Same as `from_ident`, for the identifier that follows a `$`.
            #[allow(non_upper_case_globals)]
            fn from_ct_ident(ident: &[u8]) -> Self {
                $(const $const_keyword: &[u8] = $const_keyword_lit.as_bytes();)*
                match ident {
                    $($const_keyword => Self::$const_keyword,)*
                    _ => Self::CtIdent,
                }
            }
        }
    };
}
2024-10-10 01:35:17 -05:00
/// Kind of a lexed token.
///
/// Discriminants are laid out over the byte range: a single-character token uses
/// its ASCII byte, token classes and keywords fill the letter ranges that follow
/// `@` and `` ` ``, and an `<op>=` token is its operator's discriminant plus 128.
#[derive(PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum TokenKind {
    Not = b'!',
    DQuote = b'"',
    Pound = b'#',
    CtIdent = b'$',
    Mod = b'%',
    Band = b'&',
    Quote = b'\'',
    LParen = b'(',
    RParen = b')',
    Mul = b'*',
    Add = b'+',
    Comma = b',',
    Sub = b'-',
    Dot = b'.',
    Div = b'/',

    // Unused = 0-6
    Shl = b'<' - 5, // b'7'
    // Unused = 8
    Shr = b'>' - 5, // b'9'

    Colon = b':',
    Semi = b';',
    Lt = b'<',
    Assign = b'=',
    Gt = b'>',
    Que = b'?',
    Directive = b'@',

    // Implicit discriminants continue from b'@': A-K.
    Comment,
    Ident,
    Number,
    Float,
    Eof,
    Ct,
    Ctor,
    Tupl,
    TArrow,
    Or,
    And,

    // Unused = L-Z
    LBrack = b'[',
    BSlash = b'\\',
    RBrack = b']',
    Xor = b'^',
    Under = b'_',
    Tick = b'`',

    // Implicit discriminants continue from b'`': a-t.
    Slf,
    Return,
    If,
    Match,
    Else,
    Loop,
    Break,
    Continue,
    Fn,
    Struct,
    Packed,
    Enum,
    Union,
    True,
    False,
    Null,
    Idk,
    Die,
    Defer,
    CtLoop,

    // Unused = u-z
    LBrace = b'{',
    Bor = b'|',
    RBrace = b'}',
    Tilde = b'~',

    // Two-character tokens ending in `=`: first character + 128.
    Decl = b':' + 128,
    Eq = b'=' + 128,
    Ne = b'!' + 128,
    Le = b'<' + 128,
    Ge = b'>' + 128,
    BorAss = b'|' + 128,
    AddAss = b'+' + 128,
    SubAss = b'-' + 128,
    MulAss = b'*' + 128,
    DivAss = b'/' + 128,
    ModAss = b'%' + 128,
    XorAss = b'^' + 128,
    BandAss = b'&' + 128,
    ShrAss = b'>' - 5 + 128,
    ShlAss = b'<' - 5 + 128,
}
2024-10-10 01:35:17 -05:00
/// `Debug` output is the same as `Display` output.
impl core::fmt::Debug for TokenKind {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        <Self as core::fmt::Display>::fmt(self, f)
    }
}
2024-06-25 11:39:59 -05:00
impl TokenKind {
2024-07-08 00:22:53 -05:00
pub fn ass_op(self) -> Option<Self> {
2024-06-25 11:39:59 -05:00
let id = (self as u8).saturating_sub(128);
if ascii_mask(b"|+-*/%^&79") & (1u128 << id) == 0 {
2024-06-25 11:39:59 -05:00
return None;
2024-05-11 15:22:08 -05:00
}
2024-09-30 12:09:17 -05:00
Some(unsafe { core::mem::transmute::<u8, Self>(id) })
2024-06-25 11:39:59 -05:00
}
2024-09-03 10:51:28 -05:00
pub fn is_comutative(self) -> bool {
use TokenKind as S;
matches!(self, S::Eq | S::Ne | S::Bor | S::Xor | S::Band | S::Add | S::Mul)
}
pub fn is_compatison(self) -> bool {
matches!(self, Self::Lt | Self::Gt | Self::Ge | Self::Le | Self::Ne | Self::Eq)
}
2024-10-29 07:36:12 -05:00
pub fn is_supported_float_op(self) -> bool {
matches!(
self,
Self::Add
| Self::Sub
| Self::Mul
| Self::Div
| Self::Eq
| Self::Ne
| Self::Le
| Self::Ge
| Self::Lt
| Self::Gt
)
}
pub fn apply_binop(self, a: i64, b: i64, float: bool) -> i64 {
if float {
debug_assert!(self.is_supported_float_op());
let [a, b] = [f64::from_bits(a as _), f64::from_bits(b as _)];
let res = match self {
Self::Add => a + b,
Self::Sub => a - b,
Self::Mul => a * b,
Self::Div => a / b,
Self::Eq => return (a == b) as i64,
Self::Ne => return (a != b) as i64,
Self::Lt => return (a < b) as i64,
Self::Gt => return (a > b) as i64,
Self::Le => return (a >= b) as i64,
Self::Ge => return (a <= b) as i64,
_ => todo!("floating point op: {self}"),
};
return res.to_bits() as _;
}
2024-09-03 10:51:28 -05:00
match self {
2024-09-15 13:14:56 -05:00
Self::Add => a.wrapping_add(b),
Self::Sub => a.wrapping_sub(b),
Self::Mul => a.wrapping_mul(b),
2024-10-27 05:32:34 -05:00
Self::Div if b == 0 => 0,
2024-09-15 13:14:56 -05:00
Self::Div => a.wrapping_div(b),
Self::Shl => a.wrapping_shl(b as _),
Self::Eq => (a == b) as i64,
Self::Ne => (a != b) as i64,
Self::Lt => (a < b) as i64,
Self::Gt => (a > b) as i64,
Self::Le => (a >= b) as i64,
Self::Ge => (a <= b) as i64,
2024-09-15 13:14:56 -05:00
Self::Band => a & b,
Self::Bor => a | b,
2024-10-22 00:20:08 -05:00
Self::Xor => a ^ b,
Self::Mod if b == 0 => 0,
Self::Mod => a.wrapping_rem(b),
Self::Shr => a.wrapping_shr(b as _),
2024-09-03 10:51:28 -05:00
s => todo!("{s}"),
}
}
2024-09-05 19:42:07 -05:00
pub fn is_homogenous(&self) -> bool {
self.precedence() != Self::Eq.precedence()
&& self.precedence() != Self::Gt.precedence()
&& self.precedence() != Self::Eof.precedence()
}
2024-09-15 13:14:56 -05:00
2024-10-29 08:24:31 -05:00
pub fn apply_unop(&self, value: i64, float: bool) -> i64 {
2024-09-15 13:14:56 -05:00
match self {
2024-10-29 08:24:31 -05:00
Self::Sub if float => (-f64::from_bits(value as _)).to_bits() as _,
2024-09-15 13:14:56 -05:00
Self::Sub => value.wrapping_neg(),
2024-11-11 15:14:54 -06:00
Self::Not => (value == 0) as _,
2024-10-29 08:24:31 -05:00
Self::Float if float => value,
Self::Float => (value as f64).to_bits() as _,
2024-11-12 12:02:29 -06:00
Self::Number if float => f64::from_bits(value as _) as _,
Self::Number => value,
2024-09-15 13:14:56 -05:00
s => todo!("{s}"),
}
}
pub fn closing(&self) -> Option<TokenKind> {
Some(match self {
Self::Ctor => Self::RBrace,
Self::Tupl => Self::RParen,
Self::LParen => Self::RParen,
Self::LBrack => Self::RBrack,
Self::LBrace => Self::RBrace,
_ => return None,
})
}
2024-05-10 15:54:12 -05:00
}
2024-05-11 15:22:08 -05:00
// Spelling and precedence tables for `TokenKind`; see `gen_token_kind!` for the
// meaning of each section.
gen_token_kind! {
    pub enum TokenKind {
        #[patterns]
        CtIdent,
        Ident,
        Number,
        Float,
        Eof,
        Directive,

        #[keywords]
        Slf = "Self",
        Return = "return",
        If = "if",
        Match = "match",
        Else = "else",
        Loop = "loop",
        Break = "break",
        Continue = "continue",
        Fn = "fn",
        Struct = "struct",
        Packed = "packed",
        Enum = "enum",
        Union = "union",
        True = "true",
        False = "false",
        Null = "null",
        Idk = "idk",
        Die = "die",
        Defer = "defer",
        Under = "_",

        #[const_keywords]
        CtLoop = "loop",

        #[punkt]
        Ctor = ".{",
        Tupl = ".(",
        TArrow = "=>",

        // Operators: each `#[prec]` delimits one level of precedence, listed from
        // lowest to highest.
        #[ops]
        #[prec]
        // This level also includes all `<op>=` tokens.
        Decl = ":=",
        Assign = "=",
        #[prec]
        Or = "||",
        #[prec]
        And = "&&",
        #[prec]
        Bor = "|" => BorAss,
        #[prec]
        Xor = "^" => XorAss,
        #[prec]
        Band = "&" => BandAss,
        #[prec]
        Eq = "==",
        Ne = "!=",
        #[prec]
        Le = "<=",
        Ge = ">=",
        Lt = "<",
        Gt = ">",
        #[prec]
        Shl = "<<" => ShlAss,
        Shr = ">>" => ShrAss,
        #[prec]
        Add = "+" => AddAss,
        Sub = "-" => SubAss,
        #[prec]
        Mul = "*" => MulAss,
        Div = "/" => DivAss,
        Mod = "%" => ModAss,
    }
}
2024-05-09 16:41:59 -05:00
/// Byte-oriented lexer over a borrowed source text.
pub struct Lexer<'a> {
    /// Byte offset of the next unread byte in `source`.
    pos: u32,
    /// The source text, viewed as bytes.
    source: &'a [u8],
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str) -> Self {
2024-06-24 10:26:00 -05:00
Self::restore(input, 0)
}
2024-10-14 06:25:38 -05:00
pub fn uses(input: &'a str) -> impl Iterator<Item = &'a str> {
2024-10-12 06:07:49 -05:00
let mut s = Self::new(input);
core::iter::from_fn(move || loop {
let t = s.eat();
if t.kind == TokenKind::Eof {
return None;
}
if t.kind == TokenKind::Directive
&& s.slice(t.range()) == "use"
&& s.eat().kind == TokenKind::LParen
{
let t = s.eat();
if t.kind == TokenKind::DQuote {
return Some(&s.slice(t.range())[1..t.range().len() - 1]);
}
}
})
}
2024-06-24 10:26:00 -05:00
pub fn restore(input: &'a str, pos: u32) -> Self {
2024-10-10 01:35:17 -05:00
Self { pos, source: input.as_bytes() }
2024-05-09 16:41:59 -05:00
}
2024-09-18 02:47:52 -05:00
pub fn source(&self) -> &'a str {
2024-10-10 01:35:17 -05:00
unsafe { core::str::from_utf8_unchecked(self.source) }
2024-09-18 02:47:52 -05:00
}
2024-09-30 12:09:17 -05:00
pub fn slice(&self, tok: core::ops::Range<usize>) -> &'a str {
2024-10-10 01:35:17 -05:00
unsafe { core::str::from_utf8_unchecked(&self.source[tok]) }
2024-05-09 16:41:59 -05:00
}
2024-11-24 07:47:38 -06:00
pub fn taste(&self) -> Token {
Lexer { pos: self.pos, source: self.source }.eat()
}
2024-05-09 16:41:59 -05:00
fn peek(&self) -> Option<u8> {
2024-10-10 01:35:17 -05:00
if core::intrinsics::unlikely(self.pos >= self.source.len() as u32) {
2024-06-25 12:12:35 -05:00
None
} else {
2024-10-10 01:35:17 -05:00
Some(unsafe { *self.source.get_unchecked(self.pos as usize) })
2024-06-25 12:12:35 -05:00
}
2024-05-09 16:41:59 -05:00
}
fn advance(&mut self) -> Option<u8> {
let c = self.peek()?;
self.pos += 1;
Some(c)
}
2024-07-19 14:04:22 -05:00
pub fn last(&mut self) -> Token {
2024-10-12 06:07:49 -05:00
let mut token = self.eat();
2024-07-19 14:04:22 -05:00
loop {
2024-10-12 06:07:49 -05:00
let next = self.eat();
2024-07-19 14:04:22 -05:00
if next.kind == TokenKind::Eof {
break;
}
token = next;
}
token
}
2024-10-12 06:07:49 -05:00
pub fn eat(&mut self) -> Token {
2024-05-15 03:37:39 -05:00
use TokenKind as T;
loop {
let mut start = self.pos;
let Some(c) = self.advance() else {
2024-07-08 00:22:53 -05:00
return Token { kind: T::Eof, start, end: self.pos };
2024-05-15 03:37:39 -05:00
};
2024-06-01 13:30:07 -05:00
let advance_ident = |s: &mut Self| {
2024-06-25 11:39:59 -05:00
while let Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 127..) = s.peek() {
2024-06-01 13:30:07 -05:00
s.advance();
}
};
2024-09-30 12:09:17 -05:00
let identity = |s: u8| unsafe { core::mem::transmute::<u8, T>(s) };
2024-06-25 12:12:35 -05:00
let kind = match c {
2024-06-25 11:39:59 -05:00
..=b' ' => continue,
2024-09-01 12:42:04 -05:00
b'0' if self.advance_if(b'x') => {
while let Some(b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f') = self.peek() {
2024-09-01 12:42:04 -05:00
self.advance();
}
T::Number
}
b'0' if self.advance_if(b'b') => {
while let Some(b'0' | b'1') = self.peek() {
2024-09-01 12:42:04 -05:00
self.advance();
}
T::Number
}
2024-09-01 12:45:38 -05:00
b'0' if self.advance_if(b'o') => {
while let Some(b'0'..=b'7') = self.peek() {
2024-09-01 12:45:38 -05:00
self.advance();
}
T::Number
}
2024-05-15 03:37:39 -05:00
b'0'..=b'9' => {
while let Some(b'0'..=b'9') = self.peek() {
self.advance();
}
2024-10-29 07:36:12 -05:00
if self.advance_if(b'.') {
while let Some(b'0'..=b'9') = self.peek() {
self.advance();
}
T::Float
} else {
T::Number
}
2024-05-15 03:37:39 -05:00
}
2024-06-25 11:39:59 -05:00
b'a'..=b'z' | b'A'..=b'Z' | b'_' | 127.. => {
2024-06-01 13:30:07 -05:00
advance_ident(self);
2024-10-10 01:35:17 -05:00
let ident = &self.source[start as usize..self.pos as usize];
2024-06-01 13:30:07 -05:00
T::from_ident(ident)
2024-05-15 03:37:39 -05:00
}
2024-06-25 12:46:48 -05:00
b'"' | b'\'' => loop {
match self.advance() {
Some(b'\\') => _ = self.advance(),
Some(nc) if nc == c => break identity(c),
Some(_) => {}
None => break T::Eof,
}
},
b'/' if self.advance_if(b'/') => {
2024-10-10 01:35:17 -05:00
while let Some(l) = self.peek()
2024-06-25 12:46:48 -05:00
&& l != b'\n'
2024-10-10 01:35:17 -05:00
{
self.pos += 1;
}
let end = self.source[..self.pos as usize]
.iter()
.rposition(|&b| !b.is_ascii_whitespace())
.map_or(self.pos, |i| i as u32 + 1);
return Token { kind: T::Comment, start, end };
2024-06-25 12:46:48 -05:00
}
b'/' if self.advance_if(b'*') => {
let mut depth = 1;
while let Some(l) = self.advance() {
match l {
b'/' if self.advance_if(b'*') => depth += 1,
b'*' if self.advance_if(b'/') => match depth {
1 => break,
_ => depth -= 1,
},
2024-05-17 12:53:59 -05:00
_ => {}
}
}
2024-06-25 12:46:48 -05:00
T::Comment
2024-05-17 12:53:59 -05:00
}
2024-05-15 03:37:39 -05:00
b'.' if self.advance_if(b'{') => T::Ctor,
b'.' if self.advance_if(b'(') => T::Tupl,
2024-11-17 09:25:39 -06:00
b'=' if self.advance_if(b'>') => T::TArrow,
2024-06-25 12:22:49 -05:00
b'&' if self.advance_if(b'&') => T::And,
b'|' if self.advance_if(b'|') => T::Or,
b'$' if self.advance_if(b':') => T::Ct,
b'@' => {
2024-06-25 12:22:49 -05:00
start += 1;
advance_ident(self);
identity(c)
}
b'$' => {
start += 1;
advance_ident(self);
let ident = &self.source[start as usize..self.pos as usize];
T::from_ct_ident(ident)
}
2024-06-25 12:12:35 -05:00
b'<' | b'>' if self.advance_if(c) => {
identity(c - 5 + 128 * self.advance_if(b'=') as u8)
}
b':' | b'=' | b'!' | b'<' | b'>' | b'|' | b'+' | b'-' | b'*' | b'/' | b'%'
| b'^' | b'&'
if self.advance_if(b'=') =>
{
identity(c + 128)
}
_ => identity(c),
2024-05-15 03:37:39 -05:00
};
2024-07-08 00:22:53 -05:00
return Token { kind, start, end: self.pos };
2024-05-15 03:37:39 -05:00
}
2024-05-09 16:41:59 -05:00
}
fn advance_if(&mut self, arg: u8) -> bool {
if self.peek() == Some(arg) {
self.advance();
true
} else {
false
}
}
}
2024-05-19 11:20:42 -05:00
/// Converts the byte offset `pos` within `bytes` into a 1-based `(line, column)` pair.
///
/// Lines are separated by `\n` and the column is counted in bytes.
///
/// # Panics
///
/// Panics if `pos` is greater than `bytes.len()`.
pub fn line_col(bytes: &[u8], pos: u32) -> (usize, usize) {
    let prefix = &bytes[..pos as usize];
    // Offset of the first byte of the line that contains `pos`.
    let line_start = prefix.iter().rposition(|&b| b == b'\n').map_or(0, |nl| nl + 1);
    let newlines = prefix.iter().filter(|&&b| b == b'\n').count();
    (newlines + 1, prefix.len() - line_start + 1)
}