From 820c3e459bbc00547d912c5a1e566b0bdf5f81a5 Mon Sep 17 00:00:00 2001
From: koniifer <aurlex1@gmail.com>
Date: Mon, 14 Oct 2024 01:31:23 +0100
Subject: [PATCH] optimisations

---
 .../holeybytes/kernel_services/mem_serve.rs   | 117 ++++++++++++++++--
 known_bugs.md                                 |   2 +
 sysdata/libraries/render/src/software.hb      |  52 +++++---
 3 files changed, 142 insertions(+), 29 deletions(-)
 create mode 100644 known_bugs.md

diff --git a/kernel/src/holeybytes/kernel_services/mem_serve.rs b/kernel/src/holeybytes/kernel_services/mem_serve.rs
index e99a1aa..2c01f0d 100644
--- a/kernel/src/holeybytes/kernel_services/mem_serve.rs
+++ b/kernel/src/holeybytes/kernel_services/mem_serve.rs
@@ -24,6 +24,7 @@ fn alloc_page(vm: &mut Vm, _mem_addr: u64, _length: usize) -> Result<(), MemoryS
     Ok(())
 }
 
+#[inline(always)]
 pub fn memory_msg_handler(
     vm: &mut Vm,
     mem_addr: u64,
@@ -82,32 +83,124 @@ pub fn memory_msg_handler(
             let page_count = msg_vec[1];
             log::debug!(" {} pages", page_count);
         }
-        // memcpy
+        // memcpy: word-at-a-time copy that keeps memmove semantics for overlapping ranges
         4 => unsafe {
             let count = u64::from_le_bytes(msg_vec[1..9].try_into().unwrap_unchecked()) as usize;
             let src = u64::from_le_bytes(msg_vec[9..17].try_into().unwrap_unchecked()) as *const u8;
             let dest = u64::from_le_bytes(msg_vec[17..25].try_into().unwrap_unchecked()) as *mut u8;
-            src.copy_to(dest, count);
+            // SAFETY: sound only if `src` is readable and `dest` writable for `count`
+            // bytes; the addresses come from the message and are not checked here.
+
+            let mut src_ptr = src;
+            let mut dest_ptr = dest;
+            let mut remaining = count;
+
+            // A forward copy corrupts data when `dest` starts inside the source
+            // range, so fall back to a memmove-style `core::ptr::copy` in that case.
+            let dest_offset = (dest as usize).wrapping_sub(src as usize);
+            if dest_offset < count {
+                core::ptr::copy(src, dest, count);
+                remaining = 0;
+            }
+
+            // Byte copies until `dest_ptr` is 8-byte aligned.
+            while (dest_ptr as usize) & 7 != 0 && remaining > 0 {
+                *dest_ptr = *src_ptr;
+                src_ptr = src_ptr.add(1);
+                dest_ptr = dest_ptr.add(1);
+                remaining -= 1;
+            }
+
+            // Only `dest` was aligned above; `src` may still be misaligned, so every
+            // word load uses `read_unaligned` (a plain deref is UB and trips the
+            // debug-build alignment check).
+            let mut src_ptr_64 = src_ptr as *const u64;
+            let mut dest_ptr_64 = dest_ptr as *mut u64;
+            while remaining >= 64 {
+                for word in 0..8 {
+                    dest_ptr_64.add(word).write(src_ptr_64.add(word).read_unaligned());
+                }
+                src_ptr_64 = src_ptr_64.add(8);
+                dest_ptr_64 = dest_ptr_64.add(8);
+                remaining -= 64;
+            }
+
+            while remaining >= 8 {
+                dest_ptr_64.write(src_ptr_64.read_unaligned());
+                src_ptr_64 = src_ptr_64.add(1);
+                dest_ptr_64 = dest_ptr_64.add(1);
+                remaining -= 8;
+            }
+
+            // Trailing bytes that do not fill a whole word.
+            src_ptr = src_ptr_64 as *const u8;
+            dest_ptr = dest_ptr_64 as *mut u8;
+            for _ in 0..remaining {
+                *dest_ptr = *src_ptr;
+                src_ptr = src_ptr.add(1);
+                dest_ptr = dest_ptr.add(1);
+            }
         },
-        // memset
+
+        // memset: fills `dest` with `count` repetitions of the `size`-byte pattern at `src`
         5 => unsafe {
             let count = u64::from_le_bytes(msg_vec[1..9].try_into().unwrap_unchecked()) as usize;
             let size = u64::from_le_bytes(msg_vec[9..17].try_into().unwrap_unchecked()) as usize;
             let dest = u64::from_le_bytes(msg_vec[17..25].try_into().unwrap_unchecked()) as *mut u8;
-            let src = u64::from_le_bytes(msg_vec[25..33].try_into().unwrap_unchecked()) as *mut u8;
+            let src =
+                u64::from_le_bytes(msg_vec[25..33].try_into().unwrap_unchecked()) as *const u8;
 
             let total_size = count * size;
 
             if total_size > 32 {
-                core::ptr::copy(src, dest, size);
-                let pattern = core::slice::from_raw_parts(dest, size);
-                let mut offset = size;
+                // SAFETY: sound only if `src` is readable for `size` bytes and `dest` is
+                // writable for `total_size` bytes; neither address is checked here.
+                // NOTE(review): `count * size` is unchecked and can overflow — confirm the
+                // sender is trusted or switch to `checked_mul`.
+                //
+                // `total_size > 32` implies `size != 0`, so `8 % size` cannot divide by zero.
+                if 8 % size == 0 {
+                    // `size` is 1, 2, 4 or 8: one 8-byte word holds whole repetitions.
+                    let mut pattern = [0u8; 8];
+                    for i in 0..8 {
+                        pattern[i] = *src.add(i % size);
+                    }
 
-                while offset < total_size {
-                    let remaining = total_size - offset;
-                    let copy_size = remaining.min(offset);
-                    core::ptr::copy_nonoverlapping(pattern.as_ptr(), dest.add(offset), copy_size);
-                    offset += copy_size;
+                    // Bytes up to the next 8-byte boundary of `dest` are written one at a time.
+                    let head = (8 - ((dest as usize) & 7)) & 7;
+                    for i in 0..head {
+                        *dest.add(i) = pattern[i];
+                    }
+
+                    // Rotate so byte 0 of the word is the pattern byte due at the aligned
+                    // address; every store below then stays in phase with the prologue.
+                    pattern.rotate_left(head);
+                    let word = u64::from_ne_bytes(pattern);
+
+                    // `dest + head` is 8-byte aligned, so plain aligned word stores are fine.
+                    let mut dest_ptr_64 = dest.add(head) as *mut u64;
+                    let mut remaining = total_size - head;
+                    while remaining >= 8 {
+                        dest_ptr_64.write(word);
+                        dest_ptr_64 = dest_ptr_64.add(1);
+                        remaining -= 8;
+                    }
+
+                    // Trailing bytes continue the rotated sequence.
+                    let tail = dest_ptr_64 as *mut u8;
+                    for i in 0..remaining {
+                        *tail.add(i) = pattern[i];
+                    }
+                } else {
+                    // Other sizes do not tile an 8-byte word: write one copy of the pattern,
+                    // then keep doubling the filled prefix until `total_size` bytes are set.
+                    core::ptr::copy(src, dest, size);
+                    let mut offset = size;
+                    while offset < total_size {
+                        let copy_size = (total_size - offset).min(offset);
+                        core::ptr::copy_nonoverlapping(dest, dest.add(offset), copy_size);
+                        offset += copy_size;
+                    }
                 }
             } else {
                 for i in 0..total_size {
diff --git a/known_bugs.md b/known_bugs.md
new file mode 100644
index 0000000..bdb8fca
--- /dev/null
+++ b/known_bugs.md
@@ -0,0 +1,2 @@
+# i did not know where to put this
+- memcpy / memset crash on debug builds: they dereference misaligned pointers, which debug builds detect and panic on; release builds skip the check, so the same misaligned access goes unnoticed there
\ No newline at end of file
diff --git a/sysdata/libraries/render/src/software.hb b/sysdata/libraries/render/src/software.hb
index 01107df..32c1c1d 100644
--- a/sysdata/libraries/render/src/software.hb
+++ b/sysdata/libraries/render/src/software.hb
@@ -68,25 +68,32 @@ put_pixel := fn(pos: Vec2(int), color: Color): void {
 }
 
 put_filled_rect := fn(pos: Vec2(int), tr: Vec2(int), color: Color): void {
-	y := pos.y
-	end_y := y + tr.y
-	loop if y == end_y break else {
-		@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y), @bitcast(tr.x))
-		y += 1
+	// One `memory.set` per row; the buffer index advances by `ctx.width` each pass.
+	row_idx := @inline(screenidx, pos.x, pos.y)
+	stop_idx := @inline(screenidx, pos.x, pos.y + tr.y)
+	loop if row_idx >= stop_idx break else {
+		@inline(memory.set, Color, &color, ctx.buf + row_idx, @bitcast(tr.x))
+		row_idx += ctx.width
 	}
+
 	return
 }
 
 put_rect := fn(pos: Vec2(int), tr: Vec2(int), color: Color): void {
-	y := pos.y
-	end_y := y + tr.y
-	loop if y == end_y break else {
-		*(ctx.buf + @inline(screenidx, pos.x, y)) = color;
-		*(ctx.buf + @inline(screenidx, pos.x + tr.x, y)) = color
-		y += 1
+	// Outline only: both vertical edges first, then the two horizontal edges.
+	left_idx := @inline(screenidx, pos.x, pos.y)
+	last_idx := @inline(screenidx, pos.x, pos.y + tr.y)
+	right_idx := @inline(screenidx, pos.x + tr.x, pos.y)
+	loop if left_idx > last_idx break else {
+		*(ctx.buf + left_idx) = color;
+		*(ctx.buf + right_idx) = color
+		left_idx += ctx.width
+		right_idx += ctx.width
 	}
-	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y), @bitcast(tr.x))
-	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y - tr.y), @bitcast(tr.x))
+	// `tr.x + 1` elements so each horizontal edge reaches the right-hand column.
+	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, pos.y), @bitcast(tr.x + 1))
+	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, pos.y + tr.y), @bitcast(tr.x + 1))
+
 	return
 }
 
@@ -172,10 +179,21 @@ set_dimensions := fn(new: Vec2(int)): void {
 }
 
 put_image := fn(image: Image, pos: Vec2(int)): void {
-	y := 0
-	loop if y == image.height break else {
-		@inline(memory.copy, Color, image.buf + y * image.width, ctx.buf + @inline(screenidx, pos.x, pos.y + image.height - y), @intcast(image.width))
-		y += 1
+	// Draws `image` row by row: the destination index advances by `ctx.width` per
+	// row while `cursor` walks backwards through `image.buf`, so the buffer's last
+	// row is written first.
+	// `cursor` starts at the last row, `width * (height - 1)`. Starting at
+	// `width * height` would read a full row past the end of the buffer (assuming
+	// `image.buf` holds exactly `width * height` pixels).
+
+	start_idx := @inline(screenidx, pos.x, pos.y)
+	end_idx := @inline(screenidx, pos.x, pos.y + image.height)
+	cursor := image.width * (image.height - 1)
+
+	loop if start_idx >= end_idx break else {
+		@inline(memory.copy, Color, image.buf + cursor, ctx.buf + start_idx, @intcast(image.width))
+		start_idx += ctx.width
+		cursor -= image.width
 	}
 	return
 }