From 91907a90ff2312526278d11d4d76bc3afbf3ff8b Mon Sep 17 00:00:00 2001 From: mlokr Date: Tue, 2 Jul 2024 14:49:05 +0200 Subject: [PATCH] -__- --- hblang/README.md | 14 ++ hblang/src/codegen.rs | 182 +++++++++++++++++++---- hblang/src/lib.rs | 1 + hblang/src/parser.rs | 9 ++ hblang/tests/codegen_tests_c_strings.txt | 0 5 files changed, 175 insertions(+), 31 deletions(-) create mode 100644 hblang/tests/codegen_tests_c_strings.txt diff --git a/hblang/README.md b/hblang/README.md index 09315f8..6afdbbb 100644 --- a/hblang/README.md +++ b/hblang/README.md @@ -252,6 +252,20 @@ main := fn(): int { - `@bitcast()`: tell compiler to assume `@TypeOf()` is whatever is inferred, so long as size and alignment did not change - `@eca(, ...)`: invoke `eca` instruction, where `` is the type this will return and `...` are arguments passed to the call +#### c_strings +```hb +main := fn(): int { + // when string ends with '\0' its a C string and thus type is '^u8' + some_str := "abāļž\n\r\t\{ff}\{fff0f0ff}\0"; + len := 0; + loop if *some_str == 0 break else { + len += 1; + some_str += 1; + } + return len; +} +``` + ### Incomplete Examples #### generic_types diff --git a/hblang/src/codegen.rs b/hblang/src/codegen.rs index 2e3a905..8ca364a 100644 --- a/hblang/src/codegen.rs +++ b/hblang/src/codegen.rs @@ -1,12 +1,13 @@ -use std::{ops::Range, rc::Rc}; - -use crate::{ - ident::{self, Ident}, - instrs::{self, *}, - lexer::TokenKind, - log, - parser::{self, find_symbol, idfl, Expr, ExprRef, FileId, Pos}, - HashMap, +use { + crate::{ + ident::{self, Ident}, + instrs::{self, *}, + lexer::TokenKind, + log, + parser::{self, find_symbol, idfl, Expr, ExprRef, FileId, Pos}, + HashMap, + }, + std::{ops::Range, rc::Rc}, }; use self::reg::{RET_ADDR, STACK_PTR, ZERO}; @@ -975,6 +976,7 @@ pub struct Snapshot { code: usize, funcs: usize, globals: usize, + strings: usize, } #[derive(Default)] @@ -982,6 +984,7 @@ struct Output { code: Vec, funcs: Vec<(ty::Func, Reloc)>, globals: Vec<(ty::Global, Reloc)>, + strings: Vec, } impl Output { @@ -1022,21 +1025,24 @@ impl Output { self.emit(tx()); } - fn append(&mut self, val: &mut Self) { - for (_, rel) in val.globals.iter_mut().chain(&mut val.funcs) { - rel.offset += self.code.len() as Offset; - } + fn reloc_iter_mut(&mut self, snap: &Snapshot) -> impl Iterator { + self.globals[snap.globals..] + .iter_mut() + .chain(&mut self.funcs[snap.funcs..]) + .map(|(_, rel)| rel) + .chain( + self.strings[snap.strings..] + .iter_mut() + .map(|rl| &mut rl.reloc), + ) + } - self.code.append(&mut val.code); - self.funcs.append(&mut val.funcs); - self.globals.append(&mut val.globals); + fn append(&mut self, val: &mut Self) { + val.pop(self, &Snapshot::default()); } fn pop(&mut self, stash: &mut Self, snap: &Snapshot) { - for (_, rel) in self.globals[snap.globals..] - .iter_mut() - .chain(&mut self.funcs[snap.funcs..]) - { + for rel in self.reloc_iter_mut(snap) { rel.offset -= snap.code as Offset; rel.offset += stash.code.len() as Offset; } @@ -1044,12 +1050,14 @@ impl Output { stash.code.extend(self.code.drain(snap.code..)); stash.funcs.extend(self.funcs.drain(snap.funcs..)); stash.globals.extend(self.globals.drain(snap.globals..)); + stash.strings.extend(self.strings.drain(snap.strings..)); } fn trunc(&mut self, snap: &Snapshot) { self.code.truncate(snap.code); self.globals.truncate(snap.globals); self.funcs.truncate(snap.funcs); + self.strings.truncate(snap.strings); } fn write_trap(&mut self, trap: Trap) { @@ -1063,6 +1071,7 @@ impl Output { code: self.code.len(), funcs: self.funcs.len(), globals: self.globals.len(), + strings: self.strings.len(), } } @@ -1209,10 +1218,21 @@ enum Trap { }, } +struct StringReloc { + reloc: Reloc, + range: std::ops::Range, +} +impl StringReloc { + fn range(&self) -> std::ops::Range { + self.range.start as _..self.range.end as _ + } +} + #[derive(Default)] pub struct Codegen { - pub files: Vec, - tasks: Vec>, + pub files: Vec, + tasks: Vec>, + string_data: Vec, tys: Types, ci: ItemCtx, @@ -1436,6 +1456,73 @@ impl Codegen { ty: ty::BOOL.into(), loc: Loc::imm(value as u64), }), + E::String { pos, mut literal } => { + literal = literal.trim_matches('"'); + + if !literal.ends_with("\\0") { + self.report(pos, "string literal must end with null byte (for now)"); + } + + let reloc = Reloc::new(self.output.code.len() as _, 3, 4); + let start = self.string_data.len(); + + let report = |s: &Codegen, bytes: &std::str::Bytes, message| { + s.report(pos + (literal.len() - bytes.len()) as u32 - 1, message) + }; + + let decode_braces = |s: &mut Codegen, bytes: &mut std::str::Bytes| { + while let Some(b) = bytes.next() + && b != b'}' + { + let c = bytes + .next() + .unwrap_or_else(|| report(s, bytes, "incomplete escape sequence")); + let decode = |s: &Codegen, b: u8| match b { + b'0'..=b'9' => b - b'0', + b'a'..=b'f' => b - b'a' + 10, + b'A'..=b'F' => b - b'A' + 10, + _ => report(s, bytes, "expected hex digit or '}'"), + }; + s.string_data.push(decode(s, b) << 4 | decode(s, c)); + } + }; + + let mut bytes = literal.bytes(); + while let Some(b) = bytes.next() { + if b != b'\\' { + self.string_data.push(b); + continue; + } + let b = match bytes + .next() + .unwrap_or_else(|| report(self, &bytes, "incomplete escape sequence")) + { + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'\\' => b'\\', + b'\'' => b'\'', + b'"' => b'"', + b'0' => b'\0', + b'{' => { + decode_braces(self, &mut bytes); + continue; + } + _ => report( + self, + &bytes, + "unknown escape sequence, expected [nrt\\\"'{0]", + ), + }; + self.string_data.push(b); + } + + let range = start as _..self.string_data.len() as _; + self.output.strings.push(StringReloc { reloc, range }); + let reg = self.ci.regs.allocate(); + self.output.emit(instrs::lra(reg.get(), 0, 0)); + Some(Value::new(self.tys.make_ptr(ty::U8.into()), reg)) + } E::Ctor { pos, ty, fields, .. } => { @@ -1699,20 +1786,14 @@ impl Codegen { .find(|(_, v)| v.id == id) => { let sym = parser::find_symbol(&self.files[self.ci.file as usize].symbols, id); - let loc = match idfl::index(sym.flags) == dbg!(index) + let loc = match idfl::index(sym.flags) == index && !self .ci .loops .last() .is_some_and(|l| l.var_count > var_index as u32) { - true => { - dbg!( - log::dbg!("braj: {expr}"), - std::mem::take(&mut var.value.loc) - ) - .1 - } + true => std::mem::take(&mut var.value.loc), false => var.value.loc.as_ref(), }; @@ -2489,7 +2570,45 @@ impl Codegen { _ = task::unpack(self.tys.globals[g as usize].offset) .map(|off| rel.apply_jump(&mut self.output.code, off)); true - }) + }); + + self.compress_strings(); + let base = self.output.code.len() as u32; + self.output.code.append(&mut self.string_data); + + for srel in self.output.strings.drain(..) { + srel.reloc + .apply_jump(&mut self.output.code, srel.range.start + base); + } + } + + fn compress_strings(&mut self) { + // FIXME: we can go faster + self.output + .strings + .sort_by(|a, b| self.string_data[b.range()].cmp(&self.string_data[a.range()])); + + let mut cursor = 0; + let mut anchor = 0; + for i in 1..self.output.strings.len() { + let [a, b] = self.output.strings.get_many_mut([anchor, i]).unwrap(); + if self.string_data[a.range()].ends_with(&self.string_data[b.range()]) { + b.range.end = a.range.end; + b.range.start = a.range.end - (b.range.end - b.range.start); + } else { + self.string_data.copy_within(a.range(), cursor); + cursor += a.range.len(); + anchor = i; + } + } + + if !self.output.strings.is_empty() { + let a = &self.output.strings[anchor]; + self.string_data.copy_within(a.range(), cursor); + cursor += a.range.len(); + } + + self.string_data.truncate(cursor) } // TODO: sometimes its better to do this in bulk @@ -2906,5 +3025,6 @@ mod tests { global_variables => README; generic_types => README; generic_functions => README; + c_strings => README; } } diff --git a/hblang/src/lib.rs b/hblang/src/lib.rs index 901ea81..44ed514 100644 --- a/hblang/src/lib.rs +++ b/hblang/src/lib.rs @@ -1,4 +1,5 @@ #![feature(vec_pop_if)] +#![feature(get_many_mut)] #![feature(core_intrinsics)] #![feature(new_uninit)] #![feature(anonymous_lifetime_in_impl_trait)] diff --git a/hblang/src/parser.rs b/hblang/src/parser.rs index 4159064..aaa0ed9 100644 --- a/hblang/src/parser.rs +++ b/hblang/src/parser.rs @@ -264,6 +264,10 @@ impl<'a, 'b> Parser<'a, 'b> { pos: token.start, value: true, }, + T::DQuote => E::String { + pos: token.start, + literal: self.move_str(token), + }, T::Struct => E::Struct { fields: { self.ns_bound = self.idents.len(); @@ -586,6 +590,10 @@ macro_rules! generate_expr { generate_expr! { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Expr<'a> { + String { + pos: Pos, + literal: &'a str, + }, Comment { pos: Pos, literal: &'a str, @@ -791,6 +799,7 @@ impl<'a> std::fmt::Display for Expr<'a> { } match *self { + Self::String { literal, .. } => write!(f, "{}", literal), Self::Comment { literal, .. } => write!(f, "{}", literal.trim_end()), Self::Mod { path, .. } => write!(f, "@mod(\"{path}\")"), Self::Field { target, field } => write!(f, "{}.{field}", Postfix(target)), diff --git a/hblang/tests/codegen_tests_c_strings.txt b/hblang/tests/codegen_tests_c_strings.txt new file mode 100644 index 0000000..e69de29