From 0e105921e77796e83d01c2aa4f4cadfa2005b4d9 Mon Sep 17 00:00:00 2001 From: Victor Jung <33875047+Victor-Jung@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:13:28 +0200 Subject: [PATCH] Add support for byte-wise Load and Store and fix DMA model (#18) * [feature] Add support for bytewise Load and Store and fix DMA model --- src/engine.rs | 64 +++++++++++++++++++++++++++++++++++----------- src/peripherals.rs | 2 +- src/runtime/jit.ll | 4 +-- src/runtime/jit.rs | 27 ++++++------------- src/tran.rs | 36 +++----------------------- 5 files changed, 64 insertions(+), 69 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 216996b..f32a539 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -714,6 +714,12 @@ impl<'a, 'b> Cpu<'a, 'b> { } pub fn binary_load(&self, addr: u32, size: u8) -> u32 { + if (addr % 4) % (1 << size) == 0 { + warn!( + "Hart {} (pc=0x{:08x}) is doing an unaligned load at 0x{:08x}", + self.hartid, self.state.pc, addr + ); + } match addr { x if x == self.engine.config.address.tcdm_start => { self.engine.config.memory.tcdm.start @@ -756,8 +762,6 @@ impl<'a, 'b> Cpu<'a, 'b> { + self.engine.config.memory.tcdm.size) }) => { - trace!("TCDM Binary Load"); - trace!("Binary load address: 0x{:x}", x); let id = (0..self.engine.num_clusters) .position(|i| { addr >= (self.engine.config.memory.tcdm.start @@ -775,7 +779,13 @@ impl<'a, 'b> Cpu<'a, 'b> { let word_offs = tcdm_addr - 4 * word_addr; let ptr: *const u32 = self.tcdm_ptr[id]; let word = unsafe { *ptr.offset(word_addr as isize) }; - (word >> (8 * word_offs)) & ((((1 as u64) << (8 << size)) - 1) as u32) + let val = (word >> (8 * word_offs)) & ((((1 as u64) << (8 << size)) - 1) as u32); + trace!( + "TCDM Load: addr: 0x{:x} value: 0x{:x}", + x, + (word >> (8 * word_offs)) & ((((1 as u64) << (8 << size)) - 1) as u32) + ); + val } // Peripherals x if (0..self.engine.num_clusters).any(|i| { @@ -842,19 +852,41 @@ impl<'a, 'b> Cpu<'a, 'b> { self.hartid, self.state.pc, addr ); } - // trace!("Load 0x{:x} ({}B)", addr, 8 << size); - self.engine + let word_offset = addr % 4; + let mask = (!(u64::MAX << (8 << size))) as u32; + let shift = 8 * (word_offset); + let word = ((self + .engine .memory .lock() .unwrap() - .get(&(addr as u64)) + .get(&((addr - word_offset) as u64)) .copied() - .unwrap_or(0) + .unwrap_or(0)) + >> shift) + & mask; + trace!( + "DRAM Load: addr 0x{:x} value 0x{:x} shift {} mask 0x{:x} ({}B)", + addr, + word, + shift, + mask, + 8 << size + ); + word as u32 } } } - pub fn binary_store(&self, addr: u32, value: u32, mask: u32, size: u8) { + pub fn binary_store(&self, addr: u32, value: u32, size: u8) { + if (addr % 4) % (1 << size) == 0 { + warn!( + "Hart {} (pc=0x{:08x}) is doing an unaligned store at 0x{:08x}", + self.hartid, self.state.pc, addr + ); + } + let word_offset = addr % 4; + let mask = ((((1 as u64) << (8 << size)) - 1) << (8 * word_offset)) as u32; match addr { x if x == self.engine.config.address.tcdm_start => (), // tcdm_start x if x == self.engine.config.address.tcdm_end => (), // tcdm_end @@ -873,6 +905,7 @@ impl<'a, 'b> Cpu<'a, 'b> { x if x == self.engine.config.address.uart => { let mut buffer = self.engine.putchar_buffer.lock().unwrap(); let buffer = buffer.entry(self.hartid).or_default(); + trace!("UART Store: addr 0x{:x} value 0x{:x}", addr, value); if value == '\n' as u32 { eprintln!( "{}{} hart-{:03} {} {}", @@ -917,11 +950,10 @@ impl<'a, 'b> Cpu<'a, 'b> { let word_offs = tcdm_addr - 4 * word_addr; let ptr = self.tcdm_ptr[id] as *const u32; let ptr_mut = ptr as *mut u32; - let wmask = ((((1 as u64) << (8 << size)) - 1) as u32) << (8 * word_offs); unsafe { let word_ptr = ptr_mut.offset(word_addr as isize); let word = *word_ptr; - *word_ptr = (word & !wmask) | ((value << (8 * word_offs)) & wmask); + *word_ptr = (word & !mask) | ((value << (8 * word_offs)) & mask); } } // Peripherals @@ -1013,16 +1045,18 @@ impl<'a, 'b> Cpu<'a, 'b> { ); } trace!( - "Store 0x{:x} = 0x{:x} if 0x{:x} ({}B)", + "DRAM Store: addr 0x{:x} value 0x{:x} mask 0x{:x} ({}B)", addr, value, mask, 8 << size ); + let offset_addr = addr - word_offset; let mut data = self.engine.memory.lock().unwrap(); - let data = data.entry(addr as u64).or_default(); + let data = data.entry(offset_addr as u64).or_default(); + let shifted_value = value << 8 * (addr % 4); *data &= !mask; - *data |= value & mask; + *data |= shifted_value & mask; } } } @@ -1062,14 +1096,14 @@ impl<'a, 'b> Cpu<'a, 'b> { // Aligned transfer for _ in 0..n / 4 { let tmp = self.binary_load(src, 2); - self.binary_store(dest, tmp, u32::MAX, 2); + self.binary_store(dest, tmp, 2); src += 4; dest += 4; } } else { for _ in 0..n { let tmp = self.binary_load(src, 0); - self.binary_store(dest, tmp, (u8::MAX as u32) << (8 * (dest % 4)), 0); + self.binary_store(dest, tmp, 0); src += 1; dest += 1; } diff --git a/src/peripherals.rs b/src/peripherals.rs index 405bd42..eb76927 100644 --- a/src/peripherals.rs +++ b/src/peripherals.rs @@ -535,7 +535,7 @@ impl MemPoolITA { data[[j as usize, ((n / splits) * split + i) as usize + offset]] as u8; } let word = u32::from_ne_bytes(elements); - cpu.binary_store(address + address_offset, word, u32::MAX, 2); + cpu.binary_store(address + address_offset, word, 2); debug!( "[ITA, CPU {}] Store OUT to 0x{:x}", &cpu.hartid, diff --git a/src/runtime/jit.ll b/src/runtime/jit.ll index 23b9160..b52edb6 100644 --- a/src/runtime/jit.ll +++ b/src/runtime/jit.ll @@ -18,7 +18,7 @@ ; Forward declarations. declare i32 @banshee_load(%Cpu* %cpu, i32 %addr, i8 %size) -declare void @banshee_store(%Cpu* %cpu, i32 %addr, i32 %value, i32 %mask, i8 %size) +declare void @banshee_store(%Cpu* %cpu, i32 %addr, i32 %value, i8 %size) declare i32 @banshee_rmw(%Cpu* %cpu, i32 %addr, i32 %value, i8 %op) declare i32 @banshee_csr_read(%Cpu* %cpu, i16 %csr, i32 %notrace) declare void @banshee_csr_write(%Cpu* %cpu, i16 %csr, i32 %value, i32 %notrace) @@ -58,7 +58,7 @@ declare float @banshee_fp16_to_fp32_op(i16 %rs1, i16 %rs2, float %rs3, i8 %op, i declare i16 @banshee_fp8_to_fp16_op(i8 %rs1, i8 %rs2, i16 %rs3, i8 %op, i1 %fpmode_src, i1 %fpmode_dst) declare float @banshee_fp8_to_fp32_op(i8 %rs1, i8 %rs2, float %rs3, i8 %op, i1 %fpmode_src) -declare void @banshee_ssr_write_cfg(%SsrState* %ssr, %Cpu* %cpu, i32 %addr, i32 %value, i32 %mask) +declare void @banshee_ssr_write_cfg(%SsrState* %ssr, %Cpu* %cpu, i32 %addr, i32 %value) declare i32 @banshee_ssr_read_cfg(%SsrState* readonly %ssr, i32 %addr) declare i32 @banshee_ssr_next(%SsrState* %ssr, %Cpu* %cpu) declare void @banshee_ssr_eoi(%SsrState* %ssr) diff --git a/src/runtime/jit.rs b/src/runtime/jit.rs index 86c19ff..26903d8 100644 --- a/src/runtime/jit.rs +++ b/src/runtime/jit.rs @@ -107,13 +107,11 @@ pub unsafe fn banshee_ssr_write_cfg( ssr: &mut SsrState, cpu: &mut Cpu, addr: u32, - value: u32, - mask: u32, + value: u32 ) { extern "C" { fn banshee_load(cpu: &mut Cpu, addr: u32, size: u8) -> u32; } - // TODO: Handle the mask! let addr = addr as usize / 8; let mut set_ptr = 0; match addr { @@ -269,33 +267,24 @@ pub unsafe fn banshee_dma_rep(dma: &mut DmaState, reps: u32) { pub unsafe fn banshee_dma_strt(dma: &mut DmaState, cpu: &mut Cpu, size: u32, flags: u32) -> u32 { extern "C" { fn banshee_load(cpu: &mut Cpu, addr: u32, size: u8) -> u32; - fn banshee_store(cpu: &mut Cpu, addr: u32, value: u32, mask: u32, size: u8); + fn banshee_store(cpu: &mut Cpu, addr: u32, value: u32, size: u8); } let id = dma.done_id; dma.done_id += 1; dma.size = size; - // assert_eq!( - // size % 4, - // 0, - // "DMA transfer size must be a multiple of 4B for now" - // ); - let num_beats = size / 4; let enable_2d = (flags & (1 << 1)) != 0; let steps = if enable_2d { dma.reps } else { 1 }; - + for i in 0..steps as u64 { - let src = dma.src + i * dma.src_stride as u64; - let dst = dma.dst + i * dma.dst_stride as u64; - // assert_eq!(src % 4, 0, "DMA src transfer block must be 4-byte-aligned"); - // assert_eq!(dst % 4, 0, "DMA dst transfer block must be 4-byte-aligned"); - for j in 0..num_beats as u64 { - let tmp = banshee_load(cpu, (src + j * 4) as u32, 2); - banshee_store(cpu, (dst + j * 4) as u32, tmp, u32::max_value(), 2); + let mut src = dma.src + i * dma.src_stride as u64; + let mut dst = dma.dst + i * dma.dst_stride as u64; + for j in 0..size as u64 { + let tmp = banshee_load(cpu, (src + j) as u32, 0); + banshee_store(cpu, (dst + j) as u32, tmp, 0); } } - id } diff --git a/src/tran.rs b/src/tran.rs index 866b9d9..1054744 100644 --- a/src/tran.rs +++ b/src/tran.rs @@ -1149,7 +1149,7 @@ impl<'a> InstructionTranslator<'a> { [ self.section.state_ptr, addr, - LLVMConstInt(LLVMInt8Type(), 4 as u64, 0), + LLVMConstInt(LLVMInt8Type(), 2 as u64, 0), ] .as_mut_ptr(), 3, @@ -6459,14 +6459,8 @@ impl<'a> InstructionTranslator<'a> { ), [ self.section.state_ptr, - // LLVMBuildBitCast( - // self.builder, - // self.section.state_ptr, - // LLVMPointerType(LLVMInt8Type(), 0), - // NONAME, - // ), aligned_addr, - LLVMConstInt(LLVMInt8Type(), size as u64, 0), + LLVMConstInt(LLVMInt8Type(), 2 as u64, 0), // JUNGVI: Set size to 2 in this case as we align the read after the phi block. ] .as_mut_ptr(), 3, @@ -6563,27 +6557,6 @@ impl<'a> InstructionTranslator<'a> { NONAME, ); - // Compute the misalignment. - let shift = LLVMBuildAnd( - self.builder, - addr, - LLVMConstInt(LLVMInt32Type(), 3, 0), - NONAME, - ); - let shift = LLVMBuildMul( - self.builder, - shift, - LLVMConstInt(LLVMInt32Type(), 8, 0), - NONAME, - ); - - // Align the data to the address and generate a bit mask. - let mask = LLVMConstNull(ty); - let mask = LLVMBuildNot(self.builder, mask, NONAME); - let mask = LLVMBuildZExt(self.builder, mask, LLVMInt32Type(), NONAME); - let mask = LLVMBuildShl(self.builder, mask, shift, NONAME); - let value = LLVMBuildShl(self.builder, value, shift, NONAME); - // Check if the address is in the SSR configuration space. let (is_ssr, ssr_ptr, ssr_addr) = self.emit_ssr_check(aligned_addr); let bb_ssr = LLVMCreateBasicBlockInContext(self.section.engine.context, NONAME); @@ -6596,7 +6569,7 @@ impl<'a> InstructionTranslator<'a> { LLVMPositionBuilderAtEnd(self.builder, bb_ssr); self.section.emit_call( "banshee_ssr_write_cfg", - [ssr_ptr, self.section.state_ptr, ssr_addr, value, mask], + [ssr_ptr, self.section.state_ptr, ssr_addr, value], ); LLVMBuildBr(self.builder, bb_end); LLVMPositionBuilderAtEnd(self.builder, bb_nossr); @@ -6606,9 +6579,8 @@ impl<'a> InstructionTranslator<'a> { "banshee_store", [ self.section.state_ptr, - aligned_addr, + addr, value, - mask, LLVMConstInt(LLVMInt8Type(), size as u64, 0), ], );