From 09aacff161e0927acb0f72526f9bb3ee4cb9baae Mon Sep 17 00:00:00 2001
From: mlokr <jdoka@crownsterling.io>
Date: Wed, 31 Jan 2024 20:11:57 +0100
Subject: [PATCH] establishing some syntax

---
 Cargo.lock            |  55 +++++++
 hbbytecode/Cargo.toml |   1 +
 hbbytecode/src/lib.rs |  59 ++++++-
 hblang/Cargo.toml     |   5 +
 hblang/src/lexer.rs   | 149 ++++++++++++++++++
 hblang/src/lib.rs     |  16 +-
 hblang/src/parser.rs  | 359 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 629 insertions(+), 15 deletions(-)
 create mode 100644 hblang/src/lexer.rs
 create mode 100644 hblang/src/parser.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8cc8505a..c1181a46 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -83,6 +83,12 @@ dependencies = [
  "rustc-demangle",
 ]
 
+[[package]]
+name = "beef"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1"
+
 [[package]]
 name = "bitflags"
 version = "2.4.1"
@@ -167,6 +173,12 @@ dependencies = [
  "once_cell",
 ]
 
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
 [[package]]
 name = "getrandom"
 version = "0.2.10"
@@ -197,12 +209,17 @@ dependencies = [
 name = "hbbytecode"
 version = "0.1.0"
 dependencies = [
+ "paste",
  "with_builtin_macros",
 ]
 
 [[package]]
 name = "hblang"
 version = "0.1.0"
+dependencies = [
+ "hbvm",
+ "logos",
+]
 
 [[package]]
 name = "hbvm"
@@ -254,6 +271,38 @@ version = "0.2.149"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
 
+[[package]]
+name = "logos"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c000ca4d908ff18ac99b93a062cb8958d331c3220719c52e77cb19cc6ac5d2c1"
+dependencies = [
+ "logos-derive",
+]
+
+[[package]]
+name = "logos-codegen"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc487311295e0002e452025d6b580b77bb17286de87b57138f3b5db711cded68"
+dependencies = [
+ "beef",
+ "fnv",
+ "proc-macro2",
+ "quote",
+ "regex-syntax",
+ "syn 2.0.38",
+]
+
+[[package]]
+name = "logos-derive"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbfc0d229f1f42d790440136d941afd806bc9e949e2bcb8faa813b0f00d1267e"
+dependencies = [
+ "logos-codegen",
+]
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -349,6 +398,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
 [[package]]
 name = "rhai"
 version = "1.16.2"
diff --git a/hbbytecode/Cargo.toml b/hbbytecode/Cargo.toml
index ecfb1dbc..4c198842 100644
--- a/hbbytecode/Cargo.toml
+++ b/hbbytecode/Cargo.toml
@@ -4,4 +4,5 @@ version = "0.1.0"
 edition = "2018"
 
 [dependencies]
+paste = "1.0.14"
 with_builtin_macros = "0.0.3"
diff --git a/hbbytecode/src/lib.rs b/hbbytecode/src/lib.rs
index 806e3f9a..6a91fbcb 100644
--- a/hbbytecode/src/lib.rs
+++ b/hbbytecode/src/lib.rs
@@ -23,6 +23,21 @@ macro_rules! define_items {
             #[repr(packed)]
             pub struct $name($(pub $item),*);
             unsafe impl BytecodeItem for $name {}
+
+            impl Encodable for $name {
+                fn encode(self, buffer: &mut impl Buffer) {
+                    let array = unsafe {
+                        core::mem::transmute::<Self, [u8; core::mem::size_of::<Self>()]>(self)
+                    };
+                    for byte in array {
+                        unsafe { buffer.write(byte) };
+                    }
+                }
+
+                fn encode_len(self) -> usize {
+                    core::mem::size_of::<Self>()
+                }
+            }
         )*
     };
 }
@@ -85,13 +100,55 @@ unsafe impl BytecodeItem for u8 {}
     }
 }
 
+pub trait Buffer {
+    fn reserve(&mut self, bytes: usize);
+    /// # Safety
+    /// Reserve needs to be called before this function, and only reserved amount can be written.
+    unsafe fn write(&mut self, byte: u8);
+}
+
+pub trait Encodable {
+    fn encode(self, buffer: &mut impl Buffer);
+    fn encode_len(self) -> usize;
+}
+
 macro_rules! gen_opcodes {
-    ($($opcode:expr, $mnemonic:ident, $_ty:ident, $doc:literal;)*) => {
+    ($($opcode:expr, $mnemonic:ident, $ty:ident, $doc:literal;)*) => {
         pub mod opcode {
             $(
                 #[doc = $doc]
                 pub const $mnemonic: u8 = $opcode;
             )*
+
+            paste::paste! {
+                #[derive(Clone, Copy, Debug)]
+                pub enum Op { $(
+                    [< $mnemonic:lower:camel >](super::[<Ops $ty>]),
+                )* }
+
+                impl crate::Encodable for Op {
+                    fn encode(self, buffer: &mut impl crate::Buffer) {
+                        match self {
+                            $(
+                                Self::[< $mnemonic:lower:camel >](op) => {
+                                    unsafe { buffer.write($opcode) };
+                                    op.encode(buffer);
+                                }
+                            )*
+                        }
+                    }
+
+                    fn encode_len(self) -> usize {
+                        match self {
+                            $(
+                                Self::[< $mnemonic:lower:camel >](op) => {
+                                    1 + crate::Encodable::encode_len(op)
+                                }
+                            )*
+                        }
+                    }
+                }
+            }
         }
     };
 }
diff --git a/hblang/Cargo.toml b/hblang/Cargo.toml
index ea320ae3..42880ab8 100644
--- a/hblang/Cargo.toml
+++ b/hblang/Cargo.toml
@@ -6,3 +6,8 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+#hbbytecode = { version = "0.1.0", path = "../hbbytecode" }
+logos = "0.13.0"
+
+[dev-dependencies]
+hbvm = { path = "../hbvm", features = ["nightly"] }
diff --git a/hblang/src/lexer.rs b/hblang/src/lexer.rs
new file mode 100644
index 00000000..6af6ce78
--- /dev/null
+++ b/hblang/src/lexer.rs
@@ -0,0 +1,149 @@
+use logos::Logos;
+// Generates the logos token enum `$name`, its operator enum (with `prec`) and its builtin-type enum.
+macro_rules! gen_token {
+    ($name:ident {
+        keywords: {
+            $($keyword:ident = $lit:literal,)*
+        },
+        operators: $op_name:ident {
+            $($prec:literal: {$(
+                $op:ident = $op_lit:literal,
+            )*},)*
+        },
+        types: $ty_type:ident {
+            $($ty:ident = $ty_lit:literal,)*
+        },
+        regexes: {
+            $($regex:ident = $regex_lit:literal,)*
+        },
+    }) => {
+        #[derive(Debug, Clone, PartialEq, Eq, Copy, Logos)]
+        #[logos(skip "[ \t\r\n]+")]
+        pub enum $name {
+            $(#[token($lit)] $keyword,)*
+            $($(#[token($op_lit, |_| $op_name::$op)])*)*
+            Op($op_name),
+            $(#[token($ty_lit, |_| $ty_type::$ty)])*
+            Ty($ty_type),
+            $(#[regex($regex_lit)] $regex,)*
+        }
+        // One variant per operator literal, flattened across all precedence groups.
+        #[derive(Debug, Clone, PartialEq, Eq, Copy)]
+        pub enum $op_name {
+            $($($op,)*)*
+        }
+        // One variant per builtin type name.
+        #[derive(Debug, Clone, PartialEq, Eq, Copy)]
+        pub enum $ty_type {
+            $($ty,)*
+        }
+        // `prec` returns the number of the precedence group the operator was declared in.
+        impl $op_name {
+            pub fn prec(&self) -> u8 {
+                match self {
+                    $($($op_name::$op => $prec,)*)*
+                }
+            }
+        }
+    };
+}
+
+gen_token! {
+    TokenKind {
+        keywords: {
+            Fn = "fn",
+            Let = "let",
+            If = "if",
+            Else = "else",
+            For = "for",
+            Return = "return",
+            Break = "break",
+            Continue = "continue",
+            Struct = "struct",
+
+            True = "true",
+            False = "false",
+
+            LBrace = "{",
+            RBrace = "}",
+            LParen = "(",
+            RParen = ")",
+            LBracket = "[",
+            RBracket = "]",
+
+            Colon = ":",
+            Semicolon = ";",
+            Comma = ",",
+            Dot = ".",
+        },
+        operators: Op {
+            14: {
+                Assign = "=",
+                AddAssign = "+=",
+                SubAssign = "-=",
+                MulAssign = "*=",
+                DivAssign = "/=",
+                ModAssign = "%=",
+                AndAssign = "&=",
+                OrAssign = "|=",
+                XorAssign = "^=",
+                ShlAssign = "<<=",
+                ShrAssign = ">>=",
+            },
+            12: {
+                Or = "||",
+            },
+            11: {
+                And = "&&",
+            },
+            10: {
+                Bor = "|",
+            },
+            9: {
+                Xor = "^",
+            },
+            8: {
+                Band = "&",
+            },
+            7: {
+                Eq = "==",
+                Neq = "!=",
+            },
+            6: {
+                Lt = "<",
+                Gt = ">",
+                Le = "<=",
+                Ge = ">=",
+            },
+            5: {
+                Shl = "<<",
+                Shr = ">>",
+            },
+            4: {
+                Add = "+",
+                Sub = "-",
+            },
+            3: {
+                Mul = "*",
+                Div = "/",
+                Mod = "%",
+            },
+        },
+        types: Ty {
+            U8 = "u8",
+            U16 = "u16",
+            U32 = "u32",
+            U64 = "u64",
+            I8 = "i8",
+            I16 = "i16",
+            I32 = "i32",
+            I64 = "i64",
+            Bool = "bool",
+            Void = "void",
+        },
+        regexes: {
+            Ident = "[a-zA-Z_][a-zA-Z0-9_]*",
+            Number = "[0-9]+",
+        },
+    }
+}
diff --git a/hblang/src/lib.rs b/hblang/src/lib.rs
index 7d12d9af..5333bc75 100644
--- a/hblang/src/lib.rs
+++ b/hblang/src/lib.rs
@@ -1,14 +1,2 @@
-pub fn add(left: usize, right: usize) -> usize {
-    left + right
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
-    }
-}
+mod lexer;
+mod parser;
diff --git a/hblang/src/parser.rs b/hblang/src/parser.rs
new file mode 100644
index 00000000..4629bc0a
--- /dev/null
+++ b/hblang/src/parser.rs
@@ -0,0 +1,359 @@
+use {core::panic, std::iter};
+
+use logos::{Lexer, Logos};
+
+use crate::lexer::{Op, TokenKind, Ty};
+
+pub enum Item {
+    Struct(Struct),
+    Function(Function),
+}
+
+pub enum Type {
+    Builtin(Ty),
+    Struct(String),
+}
+
+pub struct Struct {
+    pub name:   String,
+    pub fields: Vec<Field>,
+}
+
+pub struct Field {
+    pub name: String,
+    pub ty:   Type,
+}
+
+pub struct Function {
+    pub name: String,
+    pub args: Vec<Arg>,
+    pub ret:  Type,
+    pub body: Vec<Exp>,
+}
+
+pub struct Arg {
+    pub name: String,
+    pub ty:   Type,
+}
+
+pub enum Exp {
+    Literal(Literal),
+    Variable(String),
+    Call {
+        name: Box<Exp>,
+        args: Vec<Exp>,
+    },
+    Index {
+        base:  Box<Exp>,
+        index: Box<Exp>,
+    },
+    Field {
+        base:  Box<Exp>,
+        field: String,
+    },
+    Unary {
+        op:  Op,
+        exp: Box<Exp>,
+    },
+    Binary {
+        op:    Op,
+        left:  Box<Exp>,
+        right: Box<Exp>,
+    },
+    If {
+        cond:  Box<Exp>,
+        then:  Box<Exp>,
+        else_: Option<Box<Exp>>,
+    },
+    Block(Vec<Exp>),
+    Return(Box<Exp>),
+    Break,
+    Continue,
+}
+
+pub enum Literal {
+    Int(i64),
+    Bool(bool),
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct Token {
+    pub kind:  TokenKind,
+    pub span:  std::ops::Range<usize>,
+    pub value: String,
+}
+
+struct Parser<'a> {
+    next_token: Option<Token>,
+    lexer:      logos::Lexer<'a, TokenKind>,
+}
+
+impl<'a> Parser<'a> {
+    pub fn new(input: &'a str) -> Self {
+        let mut lexer = TokenKind::lexer(input);
+        let next_token = Self::next_token(&mut lexer);
+        Self { next_token, lexer }
+    }
+
+    pub fn next(&mut self) -> Option<Token> {
+        // Swap in the freshly lexed token and hand out the buffered one by move (no clone).
+        let upcoming = Self::next_token(&mut self.lexer);
+        std::mem::replace(&mut self.next_token, upcoming)
+    }
+
+    pub fn next_token(lexer: &mut Lexer<TokenKind>) -> Option<Token> {
+        // `None` from the lexer means end of input and is passed through unchanged.
+        let kind = match lexer.next()? {
+            Ok(kind) => kind,
+            Err(_) => {
+                let (line, col) = Self::pos_to_line_col_low(lexer.source(), lexer.span().start);
+                panic!("Lexer error: {}:{}", line, col,)
+            }
+        };
+        let span = lexer.span();
+        let value = lexer.slice().to_owned();
+        Some(Token { kind, span, value })
+    }
+
+    pub fn pos_to_line_col(&self, pos: usize) -> (usize, usize) {
+        Self::pos_to_line_col_low(self.lexer.source(), pos)
+    }
+    /// Maps byte offset `pos` in `source` to a (1-based line, 0-based byte column) pair.
+    pub fn pos_to_line_col_low(source: &str, pos: usize) -> (usize, usize) {
+        let line_start = source[..pos].rfind('\n').map_or(0, |nl| nl + 1);
+        let line = source[..line_start].matches('\n').count() + 1;
+        (line, pos - line_start)
+    }
+
+    pub fn expect(&mut self, kind: TokenKind) -> Token {
+        // Running out of input while a specific token is required is fatal.
+        let Some(token) = self.next() else {
+            panic!("Unexpected EOF")
+        };
+        if token.kind != kind {
+            let (line, col) = self.pos_to_line_col(token.span.start);
+            let found = token.kind;
+            panic!("Expected {:?} at {}:{}, found {:?}", kind, line, col, found);
+        }
+        token
+    }
+
+    pub fn peek(&self) -> Option<&Token> {
+        self.next_token.as_ref()
+    }
+
+    pub fn try_advance(&mut self, kind: TokenKind) -> bool {
+        // Consume the lookahead token only when it has the requested kind.
+        let matched = matches!(self.peek(), Some(t) if t.kind == kind);
+        if matched {
+            self.next();
+        }
+        matched
+    }
+
+    pub fn parse(&mut self) -> Vec<Item> {
+        iter::from_fn(|| self.parse_item()).collect()
+    }
+
+    fn parse_item(&mut self) -> Option<Item> {
+        let token = self.next()?;
+        match token.kind {
+            TokenKind::Struct => Some(self.parse_struct()),
+            TokenKind::Fn => Some(self.parse_function()),
+            tkn => {
+                let (line, col) = self.pos_to_line_col(token.span.start);
+                panic!("Unexpected {:?} at {}:{}", tkn, line, col)
+            }
+        }
+    }
+
+    fn parse_struct(&mut self) -> Item {
+        let name = self.expect(TokenKind::Ident).value;
+        self.expect(TokenKind::LBrace);
+        let fields = iter::from_fn(|| self.parse_field()).collect();
+        self.expect(TokenKind::RBrace);
+        Item::Struct(Struct { name, fields })
+    }
+
+    fn parse_field(&mut self) -> Option<Field> {
+        if self.peek()?.kind == TokenKind::RBrace {
+            return None;
+        }
+
+        let name = self.expect(TokenKind::Ident).value;
+        self.expect(TokenKind::Colon);
+        let ty = self.type_();
+        self.try_advance(TokenKind::Comma);
+
+        Some(Field { name, ty })
+    }
+    /// Parses a type: a builtin type token or an identifier naming a struct; panics otherwise.
+    fn type_(&mut self) -> Type {
+        let token = self.next().unwrap_or_else(|| panic!("Unexpected EOF"));
+        match token.kind {
+            TokenKind::Ty(ty) => Type::Builtin(ty),
+            TokenKind::Ident => Type::Struct(token.value),
+            tkn => {
+                let (line, col) = self.pos_to_line_col(token.span.start);
+                panic!("Unexpected {:?} at {}:{}", tkn, line, col)
+            }
+        }
+    }
+
+    fn parse_function(&mut self) -> Item {
+        let name = self.expect(TokenKind::Ident).value;
+        self.expect(TokenKind::LParen);
+        let args = iter::from_fn(|| self.parse_arg()).collect();
+        self.expect(TokenKind::RParen);
+        self.expect(TokenKind::Colon);
+        let ret = self.type_();
+        self.expect(TokenKind::LBrace);
+        let body = iter::from_fn(|| self.parse_stmt()).collect();
+        self.expect(TokenKind::RBrace);
+        Item::Function(Function {
+            name,
+            args,
+            ret,
+            body,
+        })
+    }
+
+    fn parse_arg(&mut self) -> Option<Arg> {
+        if self.peek()?.kind == TokenKind::RParen {
+            return None;
+        }
+
+        let name = self.expect(TokenKind::Ident).value;
+        self.expect(TokenKind::Colon);
+        let ty = self.type_();
+        self.try_advance(TokenKind::Comma);
+
+        Some(Arg { name, ty })
+    }
+
+    fn parse_stmt(&mut self) -> Option<Exp> {
+        if self.peek()?.kind == TokenKind::RBrace {
+            return None;
+        }
+
+        let expr = self.parse_expr();
+        self.expect(TokenKind::Semicolon);
+
+        Some(expr)
+    }
+
+    fn parse_expr(&mut self) -> Exp {
+        self.parse_binary_expr(255)
+    }
+    /// Precedence-climbing parser; stops at the first operator whose `prec()` is >= `min_prec`.
+    fn parse_binary_expr(&mut self, min_prec: u8) -> Exp {
+        let mut lhs = self.parse_unit_expr();
+        // Lower `prec()` binds tighter; only operators tighter than `min_prec` are folded here.
+        while let Some(TokenKind::Op(op)) = self.peek().map(|t| t.kind) {
+            let prec = op.prec();
+            if prec >= min_prec {
+                break;
+            }
+            // Consume the operator; the right operand takes only strictly tighter operators,
+            self.next();
+            let rhs = self.parse_binary_expr(prec);
+            // so operators of equal precedence group to the left.
+            lhs = Exp::Binary {
+                op,
+                left: Box::new(lhs),
+                right: Box::new(rhs),
+            };
+        }
+
+        lhs
+    }
+    /// Parses one primary expression followed by any chain of call, index and field suffixes.
+    fn parse_unit_expr(&mut self) -> Exp {
+        let token = self.next().unwrap_or_else(|| panic!("Unexpected EOF"));
+        let mut expr = match token.kind {
+            TokenKind::True => Exp::Literal(Literal::Bool(true)),
+            TokenKind::False => Exp::Literal(Literal::Bool(false)),
+            TokenKind::Ident => Exp::Variable(token.value),
+            TokenKind::LBrace => {
+                let body = iter::from_fn(|| self.parse_stmt()).collect();
+                self.expect(TokenKind::RBrace);
+                Exp::Block(body)
+            }
+            TokenKind::LParen => {
+                let expr = self.parse_expr();
+                self.expect(TokenKind::RParen);
+                expr
+            }
+            TokenKind::Number => {
+                let value = token.value.parse().unwrap();
+                Exp::Literal(Literal::Int(value))
+            }
+            TokenKind::Fn => todo!(),
+            TokenKind::Let => todo!(),
+            TokenKind::If => todo!(),
+            TokenKind::Else => todo!(),
+            TokenKind::For => todo!(),
+            TokenKind::Return => todo!(),
+            TokenKind::Break => todo!(),
+            TokenKind::Continue => todo!(),
+            TokenKind::Struct => todo!(),
+            TokenKind::RBrace => todo!(),
+            TokenKind::RParen => todo!(),
+            TokenKind::LBracket => todo!(),
+            TokenKind::RBracket => todo!(),
+            TokenKind::Colon => todo!(),
+            TokenKind::Semicolon => todo!(),
+            TokenKind::Comma => todo!(),
+            TokenKind::Op(_) => todo!(),
+            TokenKind::Ty(_) => todo!(),
+            TokenKind::Dot => todo!(),
+        };
+        // Fold postfix forms onto `expr` until the lookahead is not `(`, `[` or `.`.
+        loop {
+            match self.peek().map(|t| t.kind) {
+                Some(TokenKind::LParen) => {
+                    self.next();
+                    let args = iter::from_fn(|| self.parse_call_arg()).collect();
+                    self.expect(TokenKind::RParen);
+                    expr = Exp::Call {
+                        name: Box::new(expr),
+                        args,
+                    };
+                }
+                Some(TokenKind::LBracket) => {
+                    self.next();
+                    let index = self.parse_expr();
+                    self.expect(TokenKind::RBracket);
+                    expr = Exp::Index {
+                        base:  Box::new(expr),
+                        index: Box::new(index),
+                    };
+                }
+                Some(TokenKind::Dot) => {
+                    self.next();
+                    let field = self.expect(TokenKind::Ident).value;
+                    expr = Exp::Field {
+                        base: Box::new(expr),
+                        field,
+                    };
+                }
+                _ => break expr,
+            }
+        }
+    }
+
+    pub fn parse_call_arg(&mut self) -> Option<Exp> {
+        if self.peek()?.kind == TokenKind::RParen {
+            return None;
+        }
+
+        let expr = self.parse_expr();
+        self.try_advance(TokenKind::Comma);
+
+        Some(expr)
+    }
+}
+
+pub fn parse(input: &str) -> Vec<Item> {
+    Parser::new(input).parse()
+}