feat: lexer & overall rework

replace/7746dba3cc6b3860afe1faf69e86ed84ee46988d
Natapat Samutpong 2022-02-12 13:28:53 +07:00
parent 4f97a39f86
commit 24daf588b0
15 changed files with 241 additions and 477 deletions

32
Cargo.lock generated
View File

@ -76,6 +76,14 @@ dependencies = [
"libc",
]
[[package]]
name = "hycron"
version = "0.1.0"
dependencies = [
"clap",
"nom",
]
[[package]]
name = "indexmap"
version = "1.8.0"
@ -104,6 +112,23 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "nom"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109"
dependencies = [
"memchr",
"minimal-lexical",
"version_check",
]
[[package]]
name = "os_str_bytes"
version = "6.0.0"
@ -199,13 +224,6 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "vyc"
version = "0.1.0"
dependencies = [
"clap",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@ -1,5 +1,5 @@
[package]
name = "vyc"
name = "hycron"
version = "0.1.0"
edition = "2021"
@ -7,3 +7,4 @@ edition = "2021"
[dependencies]
clap = { version = "3.0.14", features = ["derive"] }
nom = "7.1.0"

View File

@ -1 +0,0 @@
(def number 1)

3
example/hello_world.hyc Normal file
View File

@ -0,0 +1,3 @@
let msg :: String = "Hello, World";
func add2 :: (a: Int, b: Int) -> Int = a + b;
func main :: () = puts (msg);

View File

@ -1,2 +0,0 @@
(def message "Hello, World")
(print message)

View File

@ -1 +0,0 @@
(if (equal 1 1) (print "a"))

View File

@ -3,7 +3,7 @@ use clap::{ Parser, Subcommand };
const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Vy language compiler.
/// Hycron language compiler.
#[derive(Parser, Debug)]
#[clap(
version = VERSION,

31
src/front/helper.rs Normal file
View File

@ -0,0 +1,31 @@
use std::str::{self, Utf8Error, FromStr};
/// Raw input type consumed by the lexer: an unsized byte slice.
pub type Bytes = [u8];
/// Generates a parser fn `$func_name` that matches the literal `$tag_string`
/// and yields the fixed token `$output_token`.
/// The expansion expects nom's `tag`, `map` and `IResult`, plus `Token`, to be
/// in scope at the call site (see src/front/lex.rs).
#[macro_export]
macro_rules! syntax {
($func_name: ident, $tag_string: literal, $output_token: expr) => {
fn $func_name<'a>(s: &'a Bytes) -> IResult<&Bytes, Token> {
map(tag($tag_string), |_| $output_token)(s)
}
};
}
/// Join a borrowed byte slice and an owned byte vector into one new vector,
/// with `a`'s bytes first.
pub fn concat_slice_vec(a: &Bytes, b: Vec<u8>) -> Vec<u8> {
    // Reserve the exact final size up front, then append both parts.
    let mut joined = Vec::with_capacity(a.len() + b.len());
    joined.extend_from_slice(a);
    joined.extend(b);
    joined
}
/// Decode an owned byte vector into a `String`, failing on invalid UTF-8.
pub fn convert_vec_utf8(v: Vec<u8>) -> Result<String, Utf8Error> {
    // `String::from_utf8` reuses `v`'s allocation instead of validating a
    // borrowed slice and then copying it with `to_owned()`. On failure,
    // `FromUtf8Error::utf8_error()` yields the same `Utf8Error` that
    // `str::from_utf8` would have produced, so the signature is unchanged.
    String::from_utf8(v).map_err(|e| e.utf8_error())
}
/// Borrow a byte slice as `&str`, failing on invalid UTF-8 (thin wrapper over
/// `str::from_utf8` so lexer code only deals in the `Bytes` alias).
pub fn str_from_bytes(c: &Bytes) -> Result<&str, Utf8Error> {
str::from_utf8(c)
}
/// Parse a string slice into any `FromStr` type.
pub fn str_to_from_str<F: FromStr>(c: &str) -> Result<F, F::Err> {
    // `str::parse` is the idiomatic spelling of `FromStr::from_str(c)`.
    c.parse()
}

137
src/front/lex.rs Normal file
View File

@ -0,0 +1,137 @@
use nom::{
branch::alt,
bytes::complete::{tag, take},
character::complete::{multispace0, alphanumeric1, alpha1, digit1},
combinator::{map, map_res, recognize},
IResult,
multi::many0,
sequence::{delimited, pair}, AsBytes,
};
use crate::syntax;
use super::{
model::Token,
helper::{Bytes, convert_vec_utf8, concat_slice_vec, str_from_bytes, str_to_from_str},
};
// Each `syntax!` invocation below (defined in src/front/helper.rs) expands to
// a `fn name(&Bytes) -> IResult<&Bytes, Token>` that matches the literal and
// returns the fixed token.
// Comparison
syntax! { equal_operator , "==", Token::Eq }
syntax! { not_equal_operator , "!=", Token::NEq }
syntax! { less_than_operator , "<" , Token::Lt }
syntax! { greater_than_operator , ">" , Token::Gt }
syntax! { less_than_equal_operator , "<=", Token::Lte }
syntax! { greater_than_equal_operator , ">=", Token::Gte }
// Arithmetic
syntax! { assign_operator , "=", Token::Assign }
syntax! { add_operator , "+", Token::Plus }
syntax! { subtract_operator , "-", Token::Minus }
syntax! { multiply_operator , "*", Token::Mul }
syntax! { divide_operator , "/", Token::Div }
syntax! { not_operator , "!", Token::Not }
// Punctuations
syntax! { typehint_punctuation , "::", Token::Typehint }
syntax! { lparen_punctuation , "(", Token::LParen }
syntax! { rparen_punctuation , ")", Token::RParen }
syntax! { semicolon_punctuation , ";", Token::Semicolon }
syntax! { colon_punctuation , ":", Token::Colon }
// Operator & Punctuation
/// Lex one operator or punctuation token.
///
/// `nom::branch::alt` commits to the first parser that succeeds, so any token
/// that is a prefix of a longer one must be listed *after* it. The original
/// order tried `tag("<")` before `tag("<=")` (and `>` before `>="`), which
/// made "<=" lex as `Lt` + `Assign` and ">=" as `Gt` + `Assign`.
fn lex_operator_punctuation(input: &Bytes) -> IResult<&Bytes, Token> {
    alt((
        // two-character operators before their one-character prefixes
        equal_operator, not_equal_operator,
        less_than_equal_operator, greater_than_equal_operator,
        less_than_operator, greater_than_operator,
        assign_operator,
        add_operator, subtract_operator, multiply_operator, divide_operator,
        not_operator,
        // "::" before ":" for the same prefix reason
        typehint_punctuation,
        lparen_punctuation, rparen_punctuation,
        semicolon_punctuation, colon_punctuation,
    ))(input)
}
// String
/// Consume the body of a string literal up to (but not including) the closing
/// quote, resolving backslash escapes, and return the collected bytes.
///
/// On `"` the *unconsumed* `input` is returned, so the surrounding
/// `delimited(tag("\""), .., tag("\""))` in `string` eats the closing quote.
fn string_value(input: &Bytes) -> IResult<&Bytes, Vec<u8>> {
let (i1, c1) = take(1usize)(input)?;
match c1.as_bytes() {
// End of the string body: stop without consuming the quote.
b"\"" => Ok((input, vec![])),
// Escape: drop the backslash, keep the next byte verbatim.
// NOTE(review): `\n`, `\t` etc. therefore stay the literal letter and
// are not translated to control characters — confirm that's intended.
b"\\" => {
let (i2, c2) = take(1usize)(i1)?;
string_value(i2).map(|(slice, done)| (slice, concat_slice_vec(c2, done)))
}
// Ordinary byte: keep it and recurse on the remaining input.
c => string_value(i1).map(|(slice, done)| (slice, concat_slice_vec(c, done)))
}
}
/// Lex a complete double-quoted string literal into its decoded UTF-8 contents
/// (quotes stripped, escapes resolved by `string_value`).
fn string(input: &Bytes) -> IResult<&Bytes, String> {
delimited(tag("\""), map_res(string_value, convert_vec_utf8), tag("\""))(input)
}
/// Wrap a lexed string literal in `Token::String`.
fn lex_string(input: &Bytes) -> IResult<&Bytes, Token> {
    // An enum tuple variant is itself a `fn(String) -> Token`, so it can be
    // passed to `map` directly instead of `|s| Token::String(s)`
    // (clippy::redundant_closure).
    map(string, Token::String)(input)
}
// Reserved keywords & Identifiers
/// Lex an identifier-shaped word (`[A-Za-z_][A-Za-z0-9_]*`), mapping reserved
/// keywords and boolean literals to their dedicated tokens; everything else
/// becomes `Token::Identifier`.
fn lex_reserved_identifier(input: &Bytes) -> IResult<&Bytes, Token> {
map_res(
// `recognize` returns the whole matched slice: one leading letter or
// underscore followed by any run of alphanumerics/underscores.
recognize(pair(
alt((alpha1, tag("_"))
),
many0(alt((alphanumeric1, tag("_")))),
)),
|s| {
// alpha1/alphanumeric1 only match ASCII here, but the byte->str
// conversion is still fallible, hence the outer map_res.
let c = str_from_bytes(s);
c.map(|syntax| match syntax {
"if" => Token::If,
"else" => Token::Else,
"let" => Token::Let,
"func" => Token::Func,
"true" => Token::Bool(true),
"false" => Token::Bool(false),
_ => Token::Identifier(syntax.to_string()),
})
},
)(input)
}
// Integers
/// Lex a run of ASCII digits into `Token::Int`. Only unsigned literals are
/// produced here; a leading `-` is lexed separately as `Token::Minus`.
fn lex_integer(input: &Bytes) -> IResult<&Bytes, Token> {
map(
map_res(
// bytes -> &str -> i64; either conversion failing fails the parser
map_res(digit1, str_from_bytes),
str_to_from_str,
),
Token::Int,
)(input)
}
// Illegal tokens
/// Fallback lexer: consume exactly one byte and emit `Token::Illegal`, so the
/// token loop always makes progress instead of erroring on unknown input.
fn lex_illegal(input: &Bytes) -> IResult<&Bytes, Token> {
map(take(1usize), |_| Token::Illegal)(input)
}
// Tokens
/// Lex a single token of any kind. Order matters: `lex_illegal` matches any
/// byte and must stay last.
fn lex_token(input: &Bytes) -> IResult<&Bytes, Token> {
alt((
lex_operator_punctuation,
lex_string,
lex_reserved_identifier,
lex_integer,
lex_illegal,
))(input)
}
/// Lex the whole input into a token vector, discarding whitespace around each
/// token. `many0` stops at the first position where no token can be lexed.
fn lex_tokens(input: &Bytes) -> IResult<&Bytes, Vec<Token>> {
many0(delimited(multispace0, lex_token, multispace0))(input)
}
/// Public entry point for the lexer.
pub struct Lexer;

impl Lexer {
    /// Lex `input` into a token stream terminated by `Token::EndOfFile`.
    pub fn lex_tokens(input: &Bytes) -> IResult<&Bytes, Vec<Token>> {
        // Push the EOF marker onto the vector in place; the original built a
        // second one-element Vec and concatenated slices just to append it.
        lex_tokens(input).map(|(rest, mut tokens)| {
            tokens.push(Token::EndOfFile);
            (rest, tokens)
        })
    }
}

View File

@ -1 +1,4 @@
pub mod parser;
pub mod model;
pub mod helper;
pub mod lex;

29
src/front/model.rs Normal file
View File

@ -0,0 +1,29 @@
/// All lexical token kinds produced by the lexer (src/front/lex.rs).
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
// unknown-byte sentinel / end-of-input marker appended by `Lexer::lex_tokens`
Illegal, EndOfFile,
// atoms carrying their source value
Identifier(String), String(String),
Int(i64), Bool(bool),
// `=` and the `::` type-hint marker
Assign, Typehint,
// arithmetic and logical-not operators
Plus, Minus, Mul, Div, Not,
// comparison operators
Eq, NEq, Lt, Gt, Lte, Gte,
// punctuation
LParen, RParen, Semicolon, Colon,
// reserved keywords
If, Else, Let, Func,
}
/// Token struct with position information.
/// NOTE(review): `start`/`end` are indices into the `tokens` slice
/// (0..tokens.len()), not source byte offsets — confirm intended meaning.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Tokens<'a> {
pub tokens: &'a [Token],
pub start: usize, pub end: usize,
}
impl<'a> Tokens<'a> {
// Wrap a full token slice, spanning from 0 to its length.
pub fn new(tokens: &'a [Token]) -> Self {
Tokens { tokens, start: 0, end: tokens.len(), }
}
}

View File

@ -1,314 +0,0 @@
// This implementation of parser is heavily inspired by
// brundonsmith/rust_lisp (https://github.com/brundonsmith/rust_lisp)
// go check them out!
use std::{ rc::Rc, fmt };
// A parsed Lisp value: an atom or a cons-style list.
#[derive(Debug, Clone)]
pub enum Value {
// Boolean types
True, False,
// Numbers
Int(i64), Float(f64),
// string literal contents vs. bare symbol name
String(String), Symbol(String),
// list as head (car) plus remaining elements (cdr)
List(Rc<Value>, Rc<Vec<Value>>),
// empty list / absence of a value
Nil,
}
// Display: atoms are prefixed with `$` (strings additionally quoted), lists
// print as `(car item item ...)`, Nil prints as `nil`.
impl fmt::Display for Value {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Value::True => write!(f, "$True"),
Value::False => write!(f, "$False"),
Value::Int(i) => write!(f, "${}", i),
Value::Float(fl) => write!(f, "${}", fl),
Value::String(s) => write!(f, "$\"{}\"", s),
Value::Symbol(s) => write!(f, "${}", s),
Value::List(car, cdr) => {
write!(f, "(")?;
write!(f, "{}", car)?;
for item in cdr.iter().cloned() { write!(f, " {}", item)?; }
write!(f, ")")
},
Value::Nil => write!(f, "nil"),
}
}
}
// Intermediate parse tree built by `read`: remembers per-node quoting before
// lowering into `Value`.
#[derive(Debug, Clone)]
pub enum Tree {
Atom { atom: Value, quote: bool },
List { vec: Vec<Tree>, quote: bool },
}
impl Tree {
// Lower this tree into a `Value`, wrapping quoted nodes as `(quote x)`.
// NOTE(review): `vec[0]` panics on an empty `Tree::List`; `read` converts
// empty nested lists to Nil atoms, but a bare top-level `()` would reach
// here — confirm that input cannot occur.
fn into_expr(self) -> Value {
match self {
Tree::Atom { atom, quote } => {
if quote {
Value::List(
Rc::new(Value::Symbol(String::from("quote"))),
Rc::new(vec![atom])
)
} else {
atom
}
},
Tree::List { vec, quote } => {
// First element becomes the car, the rest the cdr.
let list = Value::List(
Rc::new(vec[0].clone().into_expr()),
Rc::new(vec[1..].iter().map(|a| a.clone().into_expr()).collect())
);
if quote {
Value::List(
Rc::new(Value::Symbol(String::from("quote"))),
Rc::new(vec![list])
)
} else {
list
}
}
}
}
}
// --- Start tokenizer ---
// Tokens that are lexed as-is even without surrounding whitespace.
const SPECIAL_CHARS: [&str; 4] = ["(", ")", "'", "..."];
/// Match the characters from `with` with the characters from `from`
/// Example: match_front("123", "12") -> true
/// NOTE(review): `zip` stops at the shorter operand, so when `from` is shorter
/// than `with` (e.g. match_front("1", "12")) this also returns true — confirm
/// callers never hit that case near end of input.
fn match_front(from: &str, with: &str) -> bool { with.chars().zip(from.chars()).all(|(a, b)| a == b) }
/// Index of the last char in the leading run of `from` satisfying `predicate`,
/// or None if the very first char already fails it.
/// Example: match_predicate("abcdef", |c| c != 'f') -> Some(4) ('e' is index 4)
fn match_predicate<F: Fn(char) -> bool>(from: &str, predicate: F) -> Option<usize> {
from.char_indices().take_while(|(_, c)| predicate(*c)).last().map(|(i, _)| i)
}
/// A char may appear inside a symbol token when it is neither whitespace nor
/// part of any SPECIAL_CHARS token.
fn is_symbolic(char: char) -> bool {
!char.is_whitespace() && !SPECIAL_CHARS.iter().any(|t| t.chars().any(|other| other == char))
}
/// Return type: (token, (start, end))
/// Lazily split `src` into (token_text, byte_span) pairs. `skip` records the
/// byte index up to which a previously emitted multi-char token has already
/// consumed input, so later iterations of the per-char closure fast-forward.
pub fn tokenize(src: &str) -> impl Iterator<Item = (&str, (usize, usize))> {
let mut skip: Option<usize> = None;
src.char_indices().filter_map(move |(i, char)| {
if skip.map(|dest| dest > i).unwrap_or(false) { return None; }
else { skip = None; }
// Whitespaces
if char.is_whitespace() { return None; }
// Special characters
if // Special characters are emitted verbatim as their own tokens.
false {}
for special in &SPECIAL_CHARS {
if match_front(&src[i..], special) {
skip = Some(i + special.len());
return Some((*special, (i, i + special.len())));
}
}
// Strings
if char == '"' {
let match_end = match_predicate(&src[i + 1..], |c| c != '"');
if let Some(end) = match_end {
// NOTE(review): `+ 3` compensates for the two quotes plus
// match_predicate returning a last *index* rather than a length;
// an unterminated string falls through silently — confirm.
let string_end = i + end + 3;
skip = Some(string_end);
return Some((&src[i..string_end], (i, string_end)));
}
}
// Comments
// Check if the current char is a semicolon and the next one is too
// (";;" starts a comment running to end of line); emit nothing.
if char == ';' && src[i + 1..].chars().next().map_or(false, |c| c == ';') {
// Get length until end of line
let end = i + 2 + match_predicate(&src[i + 2..], |c| c!= '\n').unwrap_or(0);
skip = Some(end + 1);
return None;
}
// Numbers
if char.is_numeric() {
let front = i + match_predicate(&src[i..], |c| c.is_numeric()).unwrap() + 1;
// Check if its a float (by checking if its contain a dot)
if front < src.len() - 1 && &src[front..front + 1] == "." {
let back = front + match_predicate(&src[front + 1..], |c| c.is_numeric()).unwrap() + 2;
skip = Some(back);
return Some((&src[i..back], (i, back)));
} else {
skip = Some(front);
return Some((&src[i..front], (i, front)));
}
}
// Symbols
if !char.is_numeric() && is_symbolic(char) {
let end = match_predicate(&src[i..], is_symbolic);
if let Some(last) = end {
let symbol_end = i + last + 1;
skip = Some(symbol_end);
return Some((&src[i..symbol_end], (i, symbol_end)));
}
}
None
})
}
// --- End tokenizer & Start parser ---
// The categories of parse failure `read` can report.
#[derive(Debug)]
pub enum ParseErrorKind {
UnexpectedParenClose,
}
impl fmt::Display for ParseErrorKind {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
ParseErrorKind::UnexpectedParenClose => write!(f, "Unexpected ')'"),
}
}
}
// A parse failure with its (start, end) byte span in the source.
#[derive(Debug)]
pub struct ParseError {
pub kind: ParseErrorKind,
pub pos: (usize, usize),
}
impl ParseError {
fn new(kind: ParseErrorKind, pos: (usize, usize)) -> Self {
ParseError { kind, pos }
}
// Render the error with a +/-5-byte snippet of surrounding source.
// NOTE(review): slicing `src` at arbitrary byte offsets panics if the
// window edge lands inside a multi-byte UTF-8 char — confirm inputs.
pub fn at(&self, src: &str) -> String {
let snip = &src[(self.pos.0.saturating_sub(5))..(if self.pos.0 + 5 > src.len() { src.len() } else { self.pos.0 + 5 })];
format!("\n{}..{}\n{}\nError: {} at {}", " ".repeat(3), snip, format!("{}^", " ".repeat(10)), self.kind, self.pos.0)
// Example:
//
// .."))) ) (pr
// ^
// Error: Unexpected ')' at 67
}
}
// Fold a token stream into top-level expressions. Maintains a stack of open
// lists; each completed top-level form is emitted as (Value, source_span).
fn read<'a>(
tokens: impl Iterator<Item = (&'a str, (usize, usize))> + 'a
) -> impl Iterator<Item = Result<(Value, (usize, usize)), ParseError>> + 'a {
let mut stack: Vec<Tree> = Vec::new();
let mut parenths = 0; // current nesting depth
let mut quote_next = false; // a `'` was seen; quote the next node
let mut block_start = 0; // byte offset where the current top-level form began
tokens.filter_map(move |(token, (start, end))| {
match token {
"(" => {
parenths += 1;
if parenths == 1 {
block_start = start;
}
// Open a new (possibly quoted) list on the stack.
stack.push(Tree::List {
vec: Vec::new(),
quote: quote_next,
});
quote_next = false;
None
},
")" => {
parenths -= 1;
if stack.is_empty() {
Some(Err(ParseError::new(
ParseErrorKind::UnexpectedParenClose,
(start, end)
)))
} else {
let mut finished = stack.pop().unwrap();
if parenths == 0 {
// Top-level form closed: emit it and reset.
stack = Vec::new();
let r = Some(Ok((finished.into_expr(), (block_start, end))));
block_start = 0;
r
} else {
// Nested list closed: fold it into its parent.
let destination = stack.last_mut().unwrap();
// Empty lists become Nil atoms (also keeps into_expr's
// `vec[0]` from panicking on them).
if let Tree::List { vec, quote } = &finished {
if vec.is_empty() {
finished = Tree::Atom {
atom: Value::Nil,
quote: *quote,
};
}
}
if let Tree::List { vec, quote: _ } = destination { vec.push(finished); }
None
}
}
},
"'" => { quote_next = true; None },
// Any other token is an atom: attach to the open list, or emit
// directly when at top level.
_ => {
let expr = Tree::Atom {
atom: read_atom(token),
quote: quote_next,
};
quote_next = false;
if let Some(last) = stack.last_mut() {
if let Tree::List { vec, quote: _ } = last {
vec.push(expr);
}
None
} else {
Some(Ok((expr.into_expr(), (start, end))))
}
}
}
})
}
/// Convert a single non-structural token into an atomic `Value`.
/// Keywords are case-insensitive; numbers are tried as i64 then f64; a token
/// still wearing its double quotes becomes a String; anything else a Symbol.
fn read_atom(token: &str) -> Value {
let lower = token.to_lowercase();
match lower.as_str() {
"true" => Value::True,
"false" => Value::False,
"nil" => Value::Nil,
_ => {
// Parse number
if let Ok(int) = token.parse::<i64>() { Value::Int(int) }
// Parse float
else if let Ok(float) = token.parse::<f64>() { Value::Float(float) }
// Parse string: the tokenizer passes the literal through with its
// surrounding quotes attached.
else if token.starts_with('"') && token.ends_with('"') {
// Strip the quotes by *byte* offset. The original used
// `token.chars().count() - 1` — a character count — as a byte
// index, which truncates or panics on multi-byte UTF-8 content.
Value::String(String::from(&token[1..token.len() - 1]))
} else {
Value::Symbol(String::from(token))
}
}
}
}
// --- End parser ---
/// Parse Lisp source into a lazy iterator of (Value, byte_span) results:
/// tokenize, then fold tokens into expressions with `read`.
pub fn parse(src: &str) -> impl Iterator<Item = Result<(Value, (usize, usize)), ParseError>> + '_ {
read(tokenize(src))
}

View File

@ -1,44 +1,21 @@
use std::{fs::{ read_to_string, File }, io::Write};
use clap::Parser;
use std::fs;
use clap::Parser as ArgParser;
/// Arguments handler.
pub mod args;
use args::{ Args, Options };
use args::{Args, Options};
/// A front-end for the compiler.
/// Contains parser and tokenizer.
/// TODO: Semantic analysis and Type checking.
pub mod front;
use front::parser::parse;
/// A middle-end for the compiler.
/// Contains high intermediate representation (HIR).
pub mod middle;
use crate::middle::hir::to_hirs;
use front::lex::Lexer;
fn main() {
let args = Args::parse();
match args.options {
Options::Compile { input, ast } => {
let code = read_to_string(&input).unwrap();
let tree = parse(&code);
match ast {
true => for node in tree { println!("{:#?}", node) },
false => {
// Check if the tree is valid
let mut checked_tree = Vec::new();
for node in tree {
match node {
Ok(node) => checked_tree.push(node.0),
Err(err) => println!("{:?}", err),
}
};
// Convert the tree to HIR
let hir = to_hirs(&checked_tree);
println!("{:#?}", hir);
},
}
Options::Compile { input: src, ast: _print_ast } => {
let bytes: Vec<u8> = fs::read(src).unwrap();
let tokens = Lexer::lex_tokens(&bytes);
println!("{:?}", tokens);
},
}
}

View File

@ -1,116 +0,0 @@
use std::{rc::Rc, borrow::Borrow};
use crate::front::parser::Value;
#[derive(Debug, Clone)]
pub enum HIRLiteral {
True, False, Nil,
Int(i64), Float(f64),
String(String), Symbol(String),
List(Box<HIRLiteral>, Vec<HIRLiteral>),
}
// High-level IR nodes lowered from the Lisp AST (`Value`) by `to_hirs`.
#[derive(Debug, Clone)]
pub enum HIR {
Declare { name: String, value: HIRLiteral },
Set { name: String, value: HIRLiteral },
Let { bindings: Vec<(String, HIR)>, body: Vec<HIR> },
If { condition: Box<HIR>, then: Box<HIR>, else_: Option<Box<HIR>> },
Call { func: String, args: Rc<Vec<HIR>> },
// a `(quote ...)` form preserved as data
Quoted { body: HIRLiteral },
Literal(HIRLiteral),
}
// Lower a sequence of AST values into HIR nodes. Lists are dispatched on the
// symbol in head position (`quote`/`if`/`def`/`print`/`equal`); anything not
// yet handled hits `todo!()` and panics.
pub fn to_hirs(ast: &Vec<Value>) -> Vec<HIR> {
let mut hir = Vec::new();
for node in ast {
match node {
Value::List(car, cdr) => {
match &*car.borrow() {
Value::Symbol(ref function) => {
match function.as_str() {
"quote" => {
hir.push(HIR::Quoted { body: to_hir_literal(&cdr[0].clone()) });
},
"if" => {
// cdr = [condition, then, optional else]
let cond = to_hir_single(&cdr[0].clone());
let then = to_hir_single(&cdr[1].clone());
let else_ = if cdr.len() > 2 { Some(Box::new(to_hir_single(&cdr[2].clone()))) }
else { None };
hir.push(HIR::If { condition: Box::new(cond), then: Box::new(then), else_ });
}
"def" => {
// (def name value) — name must be a symbol
let name: String = match &cdr[0].clone() {
Value::Symbol(name) => name.clone(),
_ => panic!("def expects a symbol as first argument"),
};
let value = &cdr[1].clone();
hir.push(HIR::Declare { name, value: to_hir_literal(value) });
},
"print" => {
let value = &cdr[0].clone();
hir.push(HIR::Call { func: "print".to_string(), args: Rc::new(vec![to_hir_single(value)]) });
},
"equal" => {
let left = &cdr[0].clone();
let right = &cdr[1].clone();
hir.push(HIR::Call { func: "equal".to_string(), args: Rc::new(vec![to_hir_single(left), to_hir_single(right)]) });
},
_ => {
dbg!(function);
todo!();
}
} // --- End match `function` ---
},
_ => {
dbg!(car);
todo!();
} // --- End match `car` ---
}
},
// Non-list top-level values lower to literals.
_ => hir.push(to_hir_single(node)),
} // --- End match `node` ---
}
hir
}
/// Lower a single `Value` node into one HIR node.
fn to_hir_single(value: &Value) -> HIR {
    match value {
        // Re-wrap the list in a one-element Vec so the general `to_hirs`
        // driver can lower it, then take the first (and only) result.
        Value::List(car, cdr) => to_hirs(&vec![Value::List(car.clone(), cdr.clone())])[0].clone(),
        _ => HIR::Literal(to_hir_literal(value)),
    }
}
// Convert a `Value` into the structurally matching `HIRLiteral`.
fn to_hir_literal(value: &Value) -> HIRLiteral {
match value {
Value::True => HIRLiteral::True,
Value::False => HIRLiteral::False,
Value::Int(i) => HIRLiteral::Int(*i),
Value::Float(fl) => HIRLiteral::Float(*fl),
Value::String(s) => HIRLiteral::String(s.clone()),
Value::Symbol(s) => HIRLiteral::Symbol(s.clone()),
Value::List(car, cdr) => {
// Recurse into head and tail separately, mirroring Value::List.
let car_literal = to_hir_literal(&car);
let cdr_literal = cdr.iter().map(|a| to_hir_literal(a)).collect::<Vec<HIRLiteral>>();
HIRLiteral::List(Box::new(car_literal), cdr_literal)
},
Value::Nil => HIRLiteral::Nil,
}
}

View File

@ -1 +0,0 @@
pub mod hir;