From 820c3e459bbc00547d912c5a1e566b0bdf5f81a5 Mon Sep 17 00:00:00 2001
From: koniifer
Date: Mon, 14 Oct 2024 01:31:23 +0100
Subject: [PATCH] optimisations

mem_serve: copy and fill memory in 8-byte words.
- memcpy aligns the destination, reads the source with `read_unaligned`
  and falls back to `copy_to` whenever a front-to-back copy would
  overwrite source bytes that have not been read yet.
- memset fills from an 8-byte-aligned 64-byte tile when the element size
  divides 64 and keeps the existing doubling copy for every other size.

render/software: `put_filled_rect`, `put_rect` and `put_image` step a
framebuffer index by `ctx.width` per row instead of calling `screenidx`
for every row.

---
Notes (not part of the commit message):
- Re-flowed from a whitespace-collapsed copy of the patch. Indentation and
  blank context lines were reconstructed; if a hunk does not match, check
  its context against the tree before applying.
- The `index` lines still carry the post-image hashes from before review.
- known_bugs.md is no longer created: its only entry was the misaligned
  pointer crash in memcpy / memset, which this version does not have.

 .../holeybytes/kernel_services/mem_serve.rs   | 124 +++++++++++++++++-
 sysdata/libraries/render/src/software.hb      |  46 ++++---
 2 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/kernel/src/holeybytes/kernel_services/mem_serve.rs b/kernel/src/holeybytes/kernel_services/mem_serve.rs
index e99a1aa..2c01f0d 100644
--- a/kernel/src/holeybytes/kernel_services/mem_serve.rs
+++ b/kernel/src/holeybytes/kernel_services/mem_serve.rs
@@ -24,6 +24,7 @@ fn alloc_page(vm: &mut Vm, _mem_addr: u64, _length: usize) -> Result<(), MemoryS
     Ok(())
 }
 
+#[inline(always)]
 pub fn memory_msg_handler(
     vm: &mut Vm,
     mem_addr: u64,
@@ -82,23 +83,136 @@ pub fn memory_msg_handler(
             let page_count = msg_vec[1];
             log::debug!(" {} pages", page_count);
         }
-        // memcpy
+        // trash but fast memcpy
         4 => unsafe {
             let count = u64::from_le_bytes(msg_vec[1..9].try_into().unwrap_unchecked()) as usize;
             let src = u64::from_le_bytes(msg_vec[9..17].try_into().unwrap_unchecked()) as *const u8;
             let dest = u64::from_le_bytes(msg_vec[17..25].try_into().unwrap_unchecked()) as *mut u8;
-            src.copy_to(dest, count);
+
+            let mut src_ptr = src;
+            let mut dest_ptr = dest;
+            let mut remaining = count;
+
+            // The loops below copy front to back, which corrupts the data when `dest`
+            // starts inside `[src, src + count)`; use the overlap-safe copy there.
+            if (dest as usize).wrapping_sub(src as usize) < count {
+                src.copy_to(dest, count);
+                remaining = 0;
+            }
+
+            // Only `dest_ptr` gets aligned here; `src_ptr` may stay misaligned.
+            while (dest_ptr as usize) & 7 != 0 && remaining > 0 {
+                *dest_ptr = *src_ptr;
+                src_ptr = src_ptr.add(1);
+                dest_ptr = dest_ptr.add(1);
+                remaining -= 1;
+            }
+
+            let mut src_ptr_64 = src_ptr as *const u64;
+            let mut dest_ptr_64 = dest_ptr as *mut u64;
+            while remaining >= 64 {
+                let (s1, s2, s3, s4, s5, s6, s7, s8) = (
+                    src_ptr_64.read_unaligned(),
+                    src_ptr_64.add(1).read_unaligned(),
+                    src_ptr_64.add(2).read_unaligned(),
+                    src_ptr_64.add(3).read_unaligned(),
+                    src_ptr_64.add(4).read_unaligned(),
+                    src_ptr_64.add(5).read_unaligned(),
+                    src_ptr_64.add(6).read_unaligned(),
+                    src_ptr_64.add(7).read_unaligned(),
+                );
+                *dest_ptr_64 = s1;
+                *dest_ptr_64.add(1) = s2;
+                *dest_ptr_64.add(2) = s3;
+                *dest_ptr_64.add(3) = s4;
+                *dest_ptr_64.add(4) = s5;
+                *dest_ptr_64.add(5) = s6;
+                *dest_ptr_64.add(6) = s7;
+                *dest_ptr_64.add(7) = s8;
+                src_ptr_64 = src_ptr_64.add(8);
+                dest_ptr_64 = dest_ptr_64.add(8);
+                remaining -= 64;
+            }
+
+            while remaining >= 8 {
+                *dest_ptr_64 = src_ptr_64.read_unaligned();
+                src_ptr_64 = src_ptr_64.add(1);
+                dest_ptr_64 = dest_ptr_64.add(1);
+                remaining -= 8;
+            }
+
+            src_ptr = src_ptr_64 as *const u8;
+            dest_ptr = dest_ptr_64 as *mut u8;
+            for _ in 0..remaining {
+                *dest_ptr = *src_ptr;
+                src_ptr = src_ptr.add(1);
+                dest_ptr = dest_ptr.add(1);
+            }
         },
-        // memset
+
+        // trash but fast memset
         5 => unsafe {
             let count = u64::from_le_bytes(msg_vec[1..9].try_into().unwrap_unchecked()) as usize;
             let size = u64::from_le_bytes(msg_vec[9..17].try_into().unwrap_unchecked()) as usize;
             let dest = u64::from_le_bytes(msg_vec[17..25].try_into().unwrap_unchecked()) as *mut u8;
-            let src = u64::from_le_bytes(msg_vec[25..33].try_into().unwrap_unchecked()) as *mut u8;
+            let src =
+                u64::from_le_bytes(msg_vec[25..33].try_into().unwrap_unchecked()) as *const u8;
 
             let total_size = count * size;
 
-            if total_size > 32 {
+            if total_size > 32 && 64 % size == 0 {
+                let mut dest_ptr = dest;
+                let mut remaining = total_size;
+
+                // Walk the pattern byte by byte until `dest_ptr` is 8-byte aligned.
+                let mut phase = 0;
+                while (dest_ptr as usize) & 7 != 0 && remaining > 0 {
+                    *dest_ptr = *src.add(phase);
+                    phase = (phase + 1) % size;
+                    dest_ptr = dest_ptr.add(1);
+                    remaining -= 1;
+                }
+
+                // 64-byte tile starting at `phase`; `size` divides 64, so the tile repeats
+                // seamlessly. Storing it as `[u64; 8]` keeps the words 8-byte aligned.
+                let mut pattern_512 = [0u64; 8];
+                for (w, word) in pattern_512.iter_mut().enumerate() {
+                    let mut bytes = [0u8; 8];
+                    for (b, byte) in bytes.iter_mut().enumerate() {
+                        *byte = *src.add((phase + w * 8 + b) % size);
+                    }
+                    *word = u64::from_ne_bytes(bytes);
+                }
+                let [p1, p2, p3, p4, p5, p6, p7, p8] = pattern_512;
+
+                let mut dest_ptr_64 = dest_ptr as *mut u64;
+                while remaining >= 64 {
+                    *dest_ptr_64 = p1;
+                    *dest_ptr_64.add(1) = p2;
+                    *dest_ptr_64.add(2) = p3;
+                    *dest_ptr_64.add(3) = p4;
+                    *dest_ptr_64.add(4) = p5;
+                    *dest_ptr_64.add(5) = p6;
+                    *dest_ptr_64.add(6) = p7;
+                    *dest_ptr_64.add(7) = p8;
+                    dest_ptr_64 = dest_ptr_64.add(8);
+                    remaining -= 64;
+                }
+
+                // Fewer than 64 bytes left: keep walking the tile one word at a time.
+                let mut tile_word = 0;
+                while remaining >= 8 {
+                    *dest_ptr_64 = pattern_512[tile_word];
+                    tile_word += 1;
+                    dest_ptr_64 = dest_ptr_64.add(1);
+                    remaining -= 8;
+                }
+
+                dest_ptr = dest_ptr_64 as *mut u8;
+                for i in 0..remaining {
+                    *dest_ptr.add(i) = *src.add((phase + tile_word * 8 + i) % size);
+                }
+            } else if total_size > 32 {
                 core::ptr::copy(src, dest, size);
                 let pattern = core::slice::from_raw_parts(dest, size);
                 let mut offset = size;
diff --git a/sysdata/libraries/render/src/software.hb b/sysdata/libraries/render/src/software.hb
index 01107df..32c1c1d 100644
--- a/sysdata/libraries/render/src/software.hb
+++ b/sysdata/libraries/render/src/software.hb
@@ -68,25 +68,32 @@ put_pixel := fn(pos: Vec2(int), color: Color): void {
 }
 
 put_filled_rect := fn(pos: Vec2(int), tr: Vec2(int), color: Color): void {
-	y := pos.y
-	end_y := y + tr.y
-	loop if y == end_y break else {
-		@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y), @bitcast(tr.x))
-		y += 1
+	start_idx := @inline(screenidx, pos.x, pos.y)
+	end_idx := @inline(screenidx, pos.x, pos.y + tr.y)
+
+	loop if start_idx >= end_idx break else {
+		@inline(memory.set, Color, &color, ctx.buf + start_idx, @bitcast(tr.x))
+		start_idx += ctx.width
 	}
+
 	return
 }
 
 put_rect := fn(pos: Vec2(int), tr: Vec2(int), color: Color): void {
-	y := pos.y
-	end_y := y + tr.y
-	loop if y == end_y break else {
-		*(ctx.buf + @inline(screenidx, pos.x, y)) = color;
-		*(ctx.buf + @inline(screenidx, pos.x + tr.x, y)) = color
-		y += 1
+	start_idx := @inline(screenidx, pos.x, pos.y)
+	end_idx := @inline(screenidx, pos.x, pos.y + tr.y)
+	right_start_idx := @inline(screenidx, pos.x + tr.x, pos.y)
+
+	loop if start_idx > end_idx break else {
+		*(ctx.buf + start_idx) = color;
+		*(ctx.buf + right_start_idx) = color
+		start_idx += ctx.width
+		right_start_idx += ctx.width
 	}
-	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y), @bitcast(tr.x))
-	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, y - tr.y), @bitcast(tr.x))
+
+	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, pos.y), @bitcast(tr.x + 1))
+	@inline(memory.set, Color, &color, ctx.buf + @inline(screenidx, pos.x, pos.y + tr.y), @bitcast(tr.x + 1))
+
 	return
 }
 
@@ -172,10 +179,15 @@ set_dimensions := fn(new: Vec2(int)): void {
 }
 
 put_image := fn(image: Image, pos: Vec2(int)): void {
-	y := 0
-	loop if y == image.height break else {
-		@inline(memory.copy, Color, image.buf + y * image.width, ctx.buf + @inline(screenidx, pos.x, pos.y + image.height - y), @intcast(image.width))
-		y += 1
+	start_idx := @inline(screenidx, pos.x, pos.y)
+	end_idx := @inline(screenidx, pos.x, pos.y + image.height)
+	// rows are drawn last-to-first: start at the last row, not one row past the end of image.buf
+	cursor := image.width * (image.height - 1)
+
+	loop if start_idx >= end_idx break else {
+		@inline(memory.copy, Color, image.buf + cursor, ctx.buf + start_idx, @intcast(image.width))
+		start_idx += ctx.width
+		cursor -= image.width
 	}
 	return
 }