From 55fded5b14550a0682573049cc1ea9195f869f25 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:23:50 +0100
Subject: [PATCH 1/8] [hardware] Simplify the operand queues

---
 hardware/src/lane/operand_queues_stage.sv | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv
index 1445236c0..d03e8a6a4 100644
--- a/hardware/src/lane/operand_queues_stage.sv
+++ b/hardware/src/lane/operand_queues_stage.sv
@@ -57,7 +57,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (ValuInsnQueueDepth ),
     .DataBufDepth (5 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .SupportIntExt2 (1'b1 ),
@@ -86,7 +86,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (ValuInsnQueueDepth ),
     .DataBufDepth (5 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .SupportIntExt2 (1'b1 ),
@@ -204,7 +204,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (VstuInsnQueueDepth + MaskuInsnQueueDepth),
     .DataBufDepth (2 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )
@@ -248,7 +248,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (VlduInsnQueueDepth ),
     .DataBufDepth (2 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )
@@ -276,7 +276,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (MaskuInsnQueueDepth ),
     .DataBufDepth (1 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .SupportIntExt2 (1'b1 ),
     .SupportIntExt4 (1'b1 ),
     .SupportIntExt8 (1'b1 ),
@@ -303,6 +303,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (MaskuInsnQueueDepth ),
     .DataBufDepth (1 ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )

From a570818cf68ab60978cd11eb5573fd7048a01619 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:24:12 +0100
Subject: [PATCH 2/8] [hardware] Slim down addrgen check function

---
 hardware/src/vlsu/addrgen.sv | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index ed3bc59bc..005116546 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -92,8 +92,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
   import axi_pkg::CACHE_MODIFIABLE;

   // Check if the address is aligned to a particular width
+  // Max element width: 8 bytes
   function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
-    is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
+    // log2(MAX_ELEMENT_WIDTH_BYTE)
+    localparam LOG2_MAX_SEW_BYTE = 3;
+    typedef logic [LOG2_MAX_SEW_BYTE:0] max_sew_byte_t;
+
+    is_addr_error = |(max_sew_byte_t'(addr[LOG2_MAX_SEW_BYTE-1:0]) & (max_sew_byte_t'(1 << vew) - 1));
   endfunction // is_addr_error

   ////////////////////
@@ -332,7 +337,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
        };

        // Ara does not support misaligned AXI requests
-        if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin
+        if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew[1:0])) begin
          state_d = IDLE;
          addrgen_ack_o = 1'b1;
          addrgen_exception_o.valid = 1'b1;
@@ -926,7 +931,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
      // Check if the virtual address generates an exception
      // NOTE: we can do this even before address translation, since the
      // page offset (2^12) is the same for both physical and virtual addresses
-      if (is_addr_error(idx_final_vaddr_q, axi_addrgen_q.vew)) begin : eew_misaligned_error
+      if (is_addr_error(idx_final_vaddr_q, axi_addrgen_q.vew[1:0])) begin : eew_misaligned_error
        // Generate an error
        idx_op_error_d = 1'b1;
        // Forward next vstart info to the dispatcher

From 66c4f34193935b05ffc34ee7c2331a212241e874 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:24:41 +0100
Subject: [PATCH 3/8] [hardware] Replace stream regs with spill regs

---
 hardware/src/lane/valu.sv  | 4 +---
 hardware/src/lane/vmfpu.sv | 4 +---
 hardware/src/sldu/sldu.sv  | 4 +---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 7f2be6614..d3ce82bee 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -180,13 +180,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask &&
     result_queue_valid_q[result_queue_read_pnt_q];

-  stream_register #(
+  spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
     .clk_i (clk_i ),
     .rst_ni (rst_ni ),
-    .clr_i (1'b0 ),
-    .testmode_i(1'b0 ),
     .data_o (mask_operand_o ),
     .valid_o (mask_operand_valid_o ),
     .ready_i (mask_operand_ready_i ),
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index 4822ce7b7..fdf10363a 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -245,13 +245,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask &&
     result_queue_valid_q[result_queue_read_pnt_q];

-  stream_register #(
+  spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
     .clk_i (clk_i ),
     .rst_ni (rst_ni ),
-    .clr_i (1'b0 ),
-    .testmode_i(1'b0 ),
     .data_o (mask_operand_o ),
     .valid_o (mask_operand_valid_o ),
     .ready_i (mask_operand_ready_i ),
diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv
index 423ee092a..66527b442 100644
--- a/hardware/src/sldu/sldu.sv
+++ b/hardware/src/sldu/sldu.sv
@@ -239,13 +239,11 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
   logic [NrLanes-1:0] mask_ready_q;

   for (genvar l = 0; l < NrLanes; l++) begin
-    stream_register #(
+    spill_register #(
       .T(strb_t)
     ) i_mask_operand_register (
       .clk_i (clk_i ),
       .rst_ni (rst_ni ),
-      .clr_i (1'b0 ),
-      .testmode_i(1'b0 ),
       .data_o (mask_q[l] ),
       .valid_o (mask_valid_q[l] ),
       .ready_i (mask_ready_d ),

From 5995d455ed7d76b060d2f2e71ee75134a46dc0a4 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Mon, 25 Nov 2024 14:28:36 +0100
Subject: [PATCH 4/8] [hardware] Fix some latches

---
 hardware/src/lane/operand_queue.sv |  2 +-
 hardware/src/lane/vmfpu.sv         |  2 +-
 hardware/src/vlsu/addrgen.sv       | 10 ++++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index 640d77f14..0ccdcc6ae 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -209,7 +209,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
      last_packet = 1'b0;

      for (int i = 0; i < 2; i++) fp16[i] = '0;
-      for (int i = 0; i < 1; i++) fp32[i] = '0;
+      fp32 = '0;

      // Reductions need to mask away the inactive elements
      // A temporary solution is to send a neutral value directly
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index fdf10363a..bbeb78f32 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -1391,7 +1391,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
        : {vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs2, vinsn_issue_q.use_vs1};

      for (int i = 0; i < 2; i++) fp16[i] = '0;
-      for (int i = 0; i < 1; i++) fp32[i] = '0;
+      fp32 = '0;

      first_op_d = first_op_q;
      simd_red_cnt_d = simd_red_cnt_q;
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index 005116546..2a21199a3 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -232,8 +232,14 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
    addrgen_req_valid = 1'b0;

    // Nothing to acknowledge
-    addrgen_ack_o = 1'b0;
-    addrgen_exception_o = '0;
+    addrgen_ack_o = 1'b0;
+    addrgen_exception_o = '0;
+    addrgen_exception_o.valid = 1'b0;
+    addrgen_exception_o.gva = '0;
+    addrgen_exception_o.tinst = '0;
+    addrgen_exception_o.tval = '0;
+    addrgen_exception_o.tval2 = '0;
+    addrgen_exception_o.cause = '0;
    addrgen_illegal_load_o = 1'b0;
    addrgen_illegal_store_o = 1'b0;

From b04759de09180f113dd0afe49d3904b9e96f6539 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 15:16:55 +0100
Subject: [PATCH 5/8] [apps] Improve MASKU riscv-tests

---
 apps/riscv-tests/isa/macros/vector/dataset.h |   2 +-
 .../isa/macros/vector/vector_macros.h        |  11 +
 apps/riscv-tests/isa/rv64uv/vcpop.c          |  86 +++++-
 apps/riscv-tests/isa/rv64uv/vid.c            |  13 +-
 apps/riscv-tests/isa/rv64uv/viota.c          | 247 +++++++++++++++++-
 apps/riscv-tests/isa/rv64uv/vmsbf.c          | 135 +++++++++-
 apps/riscv-tests/isa/rv64uv/vmseq.c          |  59 +++++
 apps/riscv-tests/isa/rv64uv/vmsif.c          |  50 +++-
 apps/riscv-tests/isa/rv64uv/vmsof.c          |  18 +-
 apps/script/viota.py                         |  57 ++++
 10 files changed, 652 insertions(+), 26 deletions(-)
 create mode 100755 apps/script/viota.py

diff --git a/apps/riscv-tests/isa/macros/vector/dataset.h b/apps/riscv-tests/isa/macros/vector/dataset.h
index 90baacdae..cfad9409f 100644
--- a/apps/riscv-tests/isa/macros/vector/dataset.h
+++ b/apps/riscv-tests/isa/macros/vector/dataset.h
@@ -7,7 +7,7 @@
 #ifndef __DATASET_H__
 #define __DATASET_H__

-#define SIZE 64
+#define SIZE 1024
 #define L_SIZE 1024

 static volatile uint64_t Au64[SIZE] __attribute__((aligned(128)));
diff --git a/apps/riscv-tests/isa/macros/vector/vector_macros.h b/apps/riscv-tests/isa/macros/vector/vector_macros.h
index d303a7cb9..9193717e1 100644
--- a/apps/riscv-tests/isa/macros/vector/vector_macros.h
+++ b/apps/riscv-tests/isa/macros/vector/vector_macros.h
@@ -209,6 +209,17 @@ int test_case;
    asm volatile("vsetvl zero, %[vl], %[vtype]" :: [vl] "r" (vl), [vtype] "r" (vtype)); \
  } while(0)

+#define VCLEAR_AT_ONE(register) \
+  do { \
+    MEMORY_BARRIER; \
+    uint64_t vtype; uint64_t vl; uint64_t vlmax; \
+    asm volatile("csrr %[vtype], vtype" : [vtype] "=r" (vtype)); \
+    asm volatile("csrr %[vl], vl" : [vl] "=r" (vl)); \
+    asm volatile("vsetvl %[vlmax], zero, %[vtype]" : [vlmax] "=r" (vlmax) : [vtype] "r" (vtype)); \
+    asm volatile("vmv.v.i "#register", -1"); \
+    asm volatile("vsetvl zero, %[vl], %[vtype]" :: [vl] "r" (vl), [vtype] "r" (vtype)); \
+  } while(0)
+
 // Macro to initialize a vector with progressive values from a counter
 #define INIT_MEM_CNT(vec_name, size) \
  counter = 0; \
diff --git a/apps/riscv-tests/isa/rv64uv/vcpop.c b/apps/riscv-tests/isa/rv64uv/vcpop.c
index 0d0794db9..3c964215d 100644
--- a/apps/riscv-tests/isa/rv64uv/vcpop.c
+++ b/apps/riscv-tests/isa/rv64uv/vcpop.c
@@ -15,24 +15,90 @@ void TEST_CASE1(void) {
  VLOAD_32(v0, 5, 0, 0, 0);
  volatile uint32_t scalar = 1337;
  volatile uint32_t OUP[] = {0, 0, 0, 0};
-  __asm__ volatile("vpopc.m %[A], v2, v0.t \n"
-                   "sw %[A], (%1) \n"
-                   :
-                   : [A] "r"(scalar), "r"(OUP));
+  asm volatile("vpopc.m %[A], v2, v0.t \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
  XCMP(1, OUP[0], 2);
+
+  VSET(32, e32, m1);
+  VLOAD_32(v8, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF,
+           0x88, 0x1, 0x1F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_32(v0, 0xffffffffffffffff, 0xfffffffffffffff7, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xefffffffffffffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(1024, e8, m8);
+  asm volatile("vpopc.m %[A], v8, v0.t \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(2, OUP[0], 159);
 }

 // unmasked
 void TEST_CASE2(void) {
  VSET(4, e32, m1);
-  VLOAD_32(v2, 0xF, 0, 0, 0);
+  VLOAD_32(v2, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F);
  volatile uint32_t scalar = 1337;
  volatile uint32_t OUP[] = {0, 0, 0, 0};
-  __asm__ volatile("vpopc.m %[A], v2 \n"
-                   "sw %[A], (%1) \n"
-                   :
-                   : [A] "r"(scalar), "r"(OUP));
-  XCMP(2, OUP[0], 4);
+  VSET(128, e32, m2);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(3, OUP[0], 40);
+
+  VSET(8, e32, m1);
+  VLOAD_32(v0, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F);
+  VSET(256, e8, m8);
+  asm volatile("vpopc.m %[A], v0 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(4, OUP[0], 80);
+
+  VSET(16, e32, m1);
+  VLOAD_32(v0, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF,
+           0x88, 0x1, 0x1F);
+  VSET(1024, e8, m8);
+  asm volatile("vpopc.m %[A], v0 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(5, OUP[0], 160);
+
+  VSET(8, e32, m1);
+  VLOAD_32(v2, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F);
+  VSET(256, e8, m1);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(6, OUP[0], 80);
+
+  VSET(2, e32, m1);
+  VLOAD_8(v2, 0xFF, 0x88);
+  VSET(16, e16, m1);
+  asm volatile("vcpop.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(7, OUP[0], 10);
+
+  VSET(4, e32, m1);
+  VLOAD_32(v2, 0xF, 0, 0, 0);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(8, OUP[0], 4);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/vid.c b/apps/riscv-tests/isa/rv64uv/vid.c
index 7db9a1fc5..796e3e166 100644
--- a/apps/riscv-tests/isa/rv64uv/vid.c
+++ b/apps/riscv-tests/isa/rv64uv/vid.c
@@ -11,6 +11,17 @@ void TEST_CASE1() {
  VSET(16, e8, m1);
  __asm__ volatile("vid.v v1");
  VCMP_U8(1, v1, 0, 1, 2, 3, 4, 5, 6, 7);
+  VSET(10, e8, m1);
+
+  VLOAD_8(v1, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e8, m1);
+  asm volatile("vid.v v2");
+  VCMP_U8(2, v2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+          18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+          35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+          52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+          69, 70, 71, 72, 73, 74, 75, 76);
 }

 void TEST_CASE2() {
@@ -18,7 +29,7 @@ void TEST_CASE2() {
  VLOAD_8(v0, 85, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v1);
  __asm__ volatile("vid.v v1, v0.t");
-  VCMP_U8(2, v1, 0, 0, 2, 0, 4, 0, 6, 0);
+  VCMP_U8(3, v1, 0, 0, 2, 0, 4, 0, 6, 0);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/viota.c b/apps/riscv-tests/isa/rv64uv/viota.c
index 9fb17080f..bd8dbec6a 100644
--- a/apps/riscv-tests/isa/rv64uv/viota.c
+++ b/apps/riscv-tests/isa/rv64uv/viota.c
@@ -10,9 +10,208 @@
 void TEST_CASE1() {
  VSET(1, e8, m1);
  VLOAD_8(v1, 0b10001001);
-  VSET(16, e8, m1);
+  VSET(8, e8, m1);
  asm volatile("viota.m v2, v1");
  VCMP_U8(1, v2, 0, 1, 1, 1, 2, 2, 2, 2);
+
+  VSET(2, e8, m1);
+  VLOAD_8(v1, 0b01100010, 0b01001100);
+  VSET(16, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(2, v2, 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 5, 6);
+
+  VSET(1, e8, m8);
+  VLOAD_8(v0, 0b00000001);
+  VSET(4, e16, m8);
+  asm volatile("viota.m v8, v0");
+  VCMP_U16(3, v8, 0, 1, 1, 1);
+
+  VSET(64, e8, m4);
+  VLOAD_8(
+      v0, 0b10101101, 0b10000000, 0b00000110, 0b10011100, 0b10010101,
+      0b01001100, 0b01101010, 0b11000100, 0b00011110, 0b10111010, 0b00100110,
+      0b11001010, 0b01101101, 0b11001010, 0b01000101, 0b00010110, 0b00001000,
+      0b10111000, 0b11011100, 0b11100000, 0b00110101, 0b10011110, 0b01001111,
+      0b01011101, 0b00001010, 0b01111000, 0b11010100, 0b01011101, 0b11000101,
+      0b10010010, 0b01011100, 0b11010101, 0b00100010, 0b10000100, 0b11001011,
+      0b01001101, 0b01010000, 0b10110011, 0b00011000, 0b10000101, 0b01110101,
+      0b00001111, 0b10111100, 0b00010101, 0b10011101, 0b11011001, 0b11010101,
+      0b00100001, 0b01101110, 0b10000001, 0b01100100, 0b00010001, 0b00010100,
+      0b00101011, 0b11111000, 0b10010000, 0b01010000, 0b01001111, 0b00000011,
+      0b10100100, 0b10001010, 0b01110011, 0b10100010, 0b01111110);
+  VSET(512, e8, m4);
+  asm volatile("viota.m v8, v0");
+  VCMP_U8(
+      4, v8, 0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 8, 8, 8,
+      8, 8, 8, 8, 8, 9, 10, 11, 11, 11, 12, 13, 13, 14, 14, 15, 15, 15, 16, 16,
+      16, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 22, 23, 23, 23, 23, 24,
+      24, 24, 24, 25, 26, 26, 27, 28, 29, 30, 30, 30, 30, 30, 31, 31, 32, 33,
+      34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41,
+      42, 43, 43, 44, 45, 45, 46, 47, 47, 47, 48, 48, 49, 49, 49, 50, 51, 52,
+      52, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 57, 57, 57, 57, 57, 57, 57,
+      58, 58, 58, 58, 58, 58, 58, 58, 59, 60, 61, 61, 62, 62, 62, 63, 64, 65,
+      65, 66, 67, 67, 67, 67, 67, 67, 68, 69, 70, 71, 71, 72, 72, 73, 74, 74,
+      74, 74, 75, 76, 77, 78, 78, 78, 79, 80, 81, 82, 83, 83, 83, 84, 84, 85,
+      85, 86, 87, 88, 88, 89, 89, 89, 90, 90, 91, 91, 91, 91, 91, 91, 91, 91,
+      92, 93, 94, 95, 95, 95, 95, 96, 96, 97, 97, 98, 99, 100, 100, 101, 102,
+      103, 103, 104, 104, 105, 105, 106, 106, 106, 106, 107, 108, 108, 109, 109,
+      109, 110, 110, 110, 111, 111, 111, 112, 113, 114, 114, 115, 115, 116, 116,
+      117, 117, 118, 118, 119, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122,
+      122, 123, 123, 123, 123, 123, 124, 125, 126, 126, 127, 127, 127, 128, 129,
+      130, 130, 131, 132, 132, 132, 133, 133, 133, 133, 133, 133, 134, 134, 135,
+      135, 136, 137, 137, 137, 138, 139, 139, 140, 140, 140, 140, 141, 142, 142,
+      142, 142, 143, 143, 144, 144, 144, 144, 144, 145, 146, 146, 147, 147, 148,
+      149, 150, 150, 151, 152, 153, 154, 154, 154, 154, 154, 154, 154, 155, 156,
+      157, 158, 158, 159, 160, 160, 161, 161, 162, 162, 162, 162, 163, 163, 164,
+      165, 166, 166, 166, 167, 168, 168, 168, 169, 170, 170, 171, 172, 173, 173,
+      174, 174, 175, 175, 176, 177, 178, 178, 178, 178, 178, 179, 179, 179, 179,
+      180, 181, 182, 182, 183, 184, 184, 185, 185, 185, 185, 185, 185, 185, 186,
+      186, 186, 187, 187, 187, 188, 189, 189, 190, 190, 190, 190, 191, 191, 191,
+      191, 191, 191, 192, 192, 193, 193, 193, 193, 194, 195, 195, 196, 196, 197,
+      197, 197, 197, 197, 197, 198, 199, 200, 201, 202, 202, 202, 202, 202, 203,
+      203, 203, 204, 204, 204, 204, 204, 205, 205, 206, 206, 207, 208, 209, 210,
+      210, 210, 211, 211, 212, 213, 213, 213, 213, 213, 213, 213, 213, 213, 214,
+      214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 218, 218, 219, 220, 221,
+      221, 221, 222, 223, 224, 224, 224, 225, 225, 225, 225, 226, 226, 227, 227,
+      228, 229, 230, 231, 232, 233);
+
+  VSET(128, e8, m8);
+  VLOAD_8(
+      v0, 0b00000001, 0b00000011, 0b00111110, 0b00000100, 0b01101000,
+      0b10111011, 0b11010110, 0b10111111, 0b00110011, 0b00011100, 0b11010100,
+      0b00011010, 0b10100001, 0b10110100, 0b10010111, 0b01010100, 0b00011010,
+      0b01101011, 0b00101010, 0b11111111, 0b10000100, 0b11100110, 0b00100001,
+      0b01101000, 0b10110100, 0b01100010, 0b11100001, 0b10011100, 0b00110111,
+      0b01010011, 0b01010111, 0b10010001, 0b11001000, 0b01001011, 0b01000000,
+      0b10001111, 0b00001111, 0b01110100, 0b10101100, 0b00010101, 0b00110100,
+      0b10010010, 0b00001101, 0b11110011, 0b10101101, 0b10000100, 0b01111000,
+      0b11010101, 0b10110110, 0b00110110, 0b01010001, 0b01001000, 0b11100011,
+      0b01001110, 0b11101101, 0b01111000, 0b10111101, 0b00111011, 0b10111001,
+      0b11000110, 0b00000011, 0b00001110, 0b00001111, 0b00000010, 0b01010110,
+      0b00000010, 0b11011011, 0b01010100, 0b10110110, 0b10100011, 0b10100101,
+      0b11110101, 0b00000110, 0b10011111, 0b01000110, 0b00100000, 0b00100011,
+      0b11110100, 0b10111101, 0b10000010, 0b11110011, 0b00111111, 0b11000010,
+      0b00011001, 0b10000010, 0b00110011, 0b11000110, 0b11001100, 0b10011100,
+      0b11001011, 0b10101101, 0b11011110, 0b11010110, 0b11010110, 0b00100100,
+      0b01111010, 0b00111001, 0b10111000, 0b01101000, 0b00001001, 0b10010100,
+      0b11111101, 0b00001101, 0b10100111, 0b11000110, 0b01100111, 0b01010111,
+      0b10011001, 0b01100111, 0b00001011, 0b01001011, 0b10001101, 0b11110110,
+      0b10001001, 0b10010101, 0b10010010, 0b10100100, 0b01010110, 0b10110110,
+      0b10111001, 0b01010000, 0b01010001, 0b01001100, 0b11001101, 0b10111100,
+      0b11110010, 0b00001000, 0b10111000);
+  VSET(1024, e16, m8);
+  asm volatile("viota.m v8, v0");
+  VCMP_U16(
+      5, v8, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 7,
+      8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 11, 12, 12, 13, 14, 14,
+      15, 16, 17, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 24, 25, 26, 27, 28,
+      29, 29, 30, 31, 32, 32, 32, 33, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+      37, 37, 37, 38, 38, 39, 39, 40, 41, 41, 42, 42, 43, 44, 44, 44, 44, 45,
+      45, 45, 45, 45, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 54,
+      54, 55, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 59, 59, 60, 60, 61, 62,
+      62, 62, 62, 63, 64, 64, 65, 65, 66, 67, 67, 67, 68, 68, 69, 69, 70, 70,
+      70, 71, 72, 73, 74, 75, 76, 77, 78, 78, 78, 79, 79, 79, 79, 79, 80, 80,
+      81, 82, 82, 82, 83, 84, 85, 86, 86, 86, 86, 86, 87, 87, 87, 87, 87, 87,
+      88, 88, 89, 90, 90, 90, 90, 91, 91, 92, 93, 93, 94, 94, 95, 95, 95, 95,
+      96, 97, 97, 98, 98, 98, 98, 98, 99, 100, 101, 101, 101, 102, 103, 104,
+      104, 104, 105, 106, 107, 108, 108, 109, 110, 110, 110, 111, 112, 112, 112,
+      113, 113, 114, 114, 115, 116, 117, 117, 118, 118, 119, 119, 120, 120, 120,
+      120, 121, 121, 121, 122, 122, 122, 122, 123, 123, 123, 124, 125, 126, 127,
+      127, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 131,
+      132, 133, 134, 134, 134, 134, 135, 136, 137, 138, 139, 139, 139, 139, 139,
+      139, 139, 140, 140, 141, 142, 143, 143, 143, 143, 144, 145, 145, 146, 146,
+      147, 148, 148, 149, 149, 150, 150, 150, 150, 150, 150, 151, 151, 152, 153,
+      153, 153, 153, 154, 154, 154, 155, 155, 155, 156, 157, 157, 158, 159, 159,
+      159, 159, 159, 160, 161, 161, 161, 162, 163, 164, 165, 166, 166, 167, 168,
+      168, 169, 169, 170, 170, 170, 171, 171, 171, 171, 171, 172, 172, 172, 172,
+      173, 174, 175, 176, 176, 177, 177, 178, 178, 179, 179, 180, 181, 181, 182,
+      183, 183, 184, 185, 185, 186, 186, 187, 188, 188, 189, 190, 190, 190, 191,
+      191, 191, 191, 192, 192, 193, 193, 193, 193, 193, 194, 194, 194, 195, 195,
+      196, 197, 197, 197, 197, 198, 199, 200, 200, 201, 202, 203, 203, 203, 204,
+      204, 205, 205, 206, 207, 207, 208, 209, 210, 210, 210, 210, 211, 212, 213,
+      214, 214, 215, 215, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224,
+      225, 225, 225, 226, 226, 226, 227, 228, 229, 229, 230, 230, 231, 232, 232,
+      232, 232, 233, 234, 235, 236, 236, 236, 236, 236, 236, 236, 236, 237, 238,
+      239, 239, 239, 239, 239, 240, 241, 242, 243, 243, 243, 243, 243, 243, 244,
+      244, 244, 244, 244, 244, 244, 244, 245, 246, 246, 247, 247, 248, 248, 248,
+      249, 249, 249, 249, 249, 249, 249, 250, 251, 251, 252, 253, 253, 254, 255,
+      255, 255, 256, 256, 257, 257, 258, 258, 258, 259, 260, 260, 261, 262, 262,
+      263, 264, 265, 265, 265, 265, 266, 266, 267, 268, 268, 269, 269, 269, 270,
+      270, 271, 272, 272, 273, 273, 274, 275, 276, 277, 277, 278, 279, 279, 279,
+      279, 279, 279, 280, 281, 282, 283, 284, 284, 284, 285, 285, 286, 287, 287,
+      287, 287, 288, 288, 288, 288, 288, 288, 288, 289, 289, 289, 290, 291, 291,
+      291, 291, 292, 292, 292, 292, 292, 293, 293, 294, 295, 296, 297, 298, 298,
+      299, 300, 301, 302, 302, 303, 303, 304, 304, 304, 304, 304, 304, 305, 306,
+      307, 307, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 317, 317,
+      317, 318, 318, 318, 318, 318, 319, 320, 321, 321, 321, 322, 323, 323, 323,
+      323, 323, 324, 324, 324, 324, 324, 324, 325, 326, 327, 327, 327, 328, 329,
+      329, 329, 329, 330, 331, 331, 331, 331, 332, 333, 333, 333, 334, 335, 335,
+      335, 336, 337, 337, 337, 338, 339, 340, 340, 340, 341, 342, 343, 343, 344,
+      344, 344, 345, 346, 347, 347, 348, 349, 349, 350, 350, 351, 351, 352, 353,
+      354, 355, 355, 356, 357, 357, 358, 359, 359, 360, 360, 361, 362, 362, 363,
+      364, 364, 365, 365, 366, 367, 367, 367, 368, 368, 368, 369, 369, 369, 369,
+      370, 370, 371, 372, 373, 374, 374, 375, 375, 375, 376, 377, 378, 378, 378,
+      378, 378, 378, 379, 380, 381, 381, 382, 382, 382, 382, 383, 383, 384, 385,
+      385, 386, 386, 386, 387, 387, 387, 387, 387, 387, 387, 388, 388, 389, 389,
+      389, 390, 391, 391, 392, 393, 394, 395, 396, 397, 398, 398, 399, 400, 400,
+      400, 400, 400, 401, 402, 403, 403, 403, 404, 404, 405, 405, 406, 407, 407,
+      407, 407, 408, 409, 410, 411, 412, 412, 412, 413, 414, 414, 415, 416, 417,
+      417, 418, 418, 419, 419, 420, 420, 420, 421, 422, 422, 422, 423, 424, 425,
+      426, 426, 426, 427, 428, 428, 429, 430, 430, 431, 431, 431, 431, 431, 432,
+      433, 433, 434, 434, 434, 435, 435, 436, 436, 437, 438, 438, 438, 438, 439,
+      439, 440, 441, 441, 442, 443, 444, 445, 446, 446, 446, 447, 447, 447, 447,
+      448, 449, 449, 450, 450, 451, 451, 451, 452, 452, 453, 453, 453, 454, 454,
+      454, 455, 455, 455, 456, 456, 456, 457, 457, 458, 458, 459, 460, 460, 461,
+      461, 462, 462, 462, 463, 464, 464, 465, 466, 466, 467, 468, 468, 468, 469,
+      470, 471, 471, 472, 472, 472, 472, 472, 473, 473, 474, 474, 475, 475, 475,
+      475, 476, 476, 477, 477, 477, 477, 478, 479, 479, 479, 480, 480, 481, 481,
+      482, 483, 483, 483, 484, 485, 485, 485, 486, 487, 488, 489, 489, 490, 490,
+      491, 491, 491, 492, 493, 494, 495, 495, 495, 495, 496, 496, 496, 496, 496,
+      496, 496, 496, 497, 498, 499, 499);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v1, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(6, v2, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v0, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e32, m2);
+  asm volatile("viota.m v2, v0");
+  VCMP_U32(7, v2, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+           7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+           14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+           24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+           30, 31, 31);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v0, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e64, m4);
+  asm volatile("viota.m v4, v0");
+  VCMP_U64(8, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+           7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+           14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+           24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+           30, 31, 31);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v0, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(77, e8, m4);
+  asm volatile("viota.m v4, v0");
+  VCMP_U8(9, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
 }

 void TEST_CASE2() {
@@ -24,7 +223,51 @@ void TEST_CASE2() {
  VLOAD_8(v0, 0b11000111);
  VSET(16, e8, m1);
  asm volatile("viota.m v2, v1, v0.t");
-  VCMP_U8(2, v2, 0, 1, 1, 3, 4, 5, 1, 1);
+  VCMP_U8(10, v2, 0, 1, 1, 3, 4, 5, 1, 1);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b00001111);
+  VSET(77, e8, m4);
+  VCLEAR(v4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(11, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 0);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b00001011);
+  VSET(77, e8, m4);
+  VCLEAR(v4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(12, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30, 0,
+          30, 0);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111);
+  VSET(77, e8, m4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(13, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/vmsbf.c b/apps/riscv-tests/isa/rv64uv/vmsbf.c
index 7a4e15e9f..eb92f3ddb 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsbf.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsbf.c
@@ -10,17 +10,141 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsbf.m v2, v3");
-  VCMP_U8(1, v2, 7, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 7, 0);
 }

 void TEST_CASE2() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 4, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 3, 0);
+}
+
+void TEST_CASE3() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 0xff, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(4, v2, 0, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x08, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(16, e32, m1);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(5, v2, 0x07, 0x00);
+
+  VSET(8, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0);
+  VSET(512, e8, m2);
+  asm volatile("vmsbf.m v2, v4");
+  VSET(16, e32, m2);
+  VCMP_U32(6, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0,
+           0, 0, 0);
+
+  VSET(16, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0, 1685, 0, 0, 1, 0, 0, 0, 0);
+  VSET(1024, e8, m2);
+  asm volatile("vmsbf.m v2, v4");
+  VSET(32, e32, m2);
+  VCMP_U32(7, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+void TEST_CASE4() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
  VLOAD_8(v0, 3, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsbf.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 3, 0, 0, 0, 0, 0, 0, 0);
+  VSET(2, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VCMP_U8(8, v2, 3, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 5, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  VSET(16, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(9, v2, 5, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x18, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(10, v2, 0x07, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x18, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xef, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(11, v2, 0x07, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0xff, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(12, v2, 0xf7, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x38, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(13, v2, 0x7, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 5, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  VSET(16, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VCMP_U8(14, v2, 5, 0);
+}
+
+void TEST_CASE5() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(15, v2, 3, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR_AT_ONE(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(16, v2, 0xf7, 0xff);
+
+  VSET(8, e8, m1);
+  VLOAD_8(v3, 0x94, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xC3, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(1, e8, m1);
+  VCMP_U8(17, v2, 0x43);
 }

 int main(void) {
@@ -29,5 +153,8 @@ int main(void) {
  INIT_CHECK();
  enable_vec();
  enable_fp();
  TEST_CASE1();
  TEST_CASE2();
+  TEST_CASE3();
+  TEST_CASE4();
+  TEST_CASE5();
  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmseq.c b/apps/riscv-tests/isa/rv64uv/vmseq.c
index 582ba8f81..b881857d6 100644
--- a/apps/riscv-tests/isa/rv64uv/vmseq.c
+++ b/apps/riscv-tests/isa/rv64uv/vmseq.c
@@ -291,6 +291,64 @@ void TEST_CASE6(void) {
  VCMP_U8(24, v1, 0x10, 0x10);
 };

+void TEST_CASE7(void) {
+  VSET(16, e8, m1);
+  VLOAD_8(v2, 0xff, 0x00, 0xf0, 0x0f, 0xff, 0x00, 0xf0, 0x0f, 0xff, 0x00, 0xf0,
+          0x0f, 0xff, 0x00, 0xf0, 0x0f);
+  VLOAD_8(v3, 0xf2, 0x01, 0xf0, 0x0f, 0xf2, 0x01, 0xf0, 0x0f, 0xf2, 0x01, 0xf0,
+          0x0f, 0xf2, 0x01, 0xf0, 0x0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(25, v1, 0xdd, 0xdd);
+
+  VSET(16, e16, m1);
+  VLOAD_16(v2, 0xffff, 0x0000, 0xf0f0, 0x0f0f, 0xffff, 0x0000, 0xf0f0, 0x0f0f,
+           0xffff, 0x0000, 0xf0f0, 0x0f0f, 0xffff, 0x0000, 0xf0f0, 0x0f0f);
+  VLOAD_16(v3, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f,
+           0xf2ff, 0x0100, 0xf0f0, 0x0f0f, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(26, v1, 0xdd, 0xdd);
+
+  VSET(16, e32, m1);
+  VLOAD_32(v2, 0xffffffff, 0x00000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff,
+           0x00000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff, 0x00000000,
+           0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff, 0x00000000, 0xf0f0f0f0,
+           0x0f0f0f0f);
+  VLOAD_32(v3, 0xfff2ffff, 0x01000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff,
+           0x01000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff, 0x01000000,
+           0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff, 0x01000000, 0xf0f0f0f0,
+           0x0f0f0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(27, v1, 0xdd, 0xdd);
+
+  VSET(16, e64, m1);
+  VLOAD_64(v2, 0xffffffffffffffff, 0x0000000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f, 0xffffffffffffffff, 0x0000000000000000,
+           0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, 0xffffffffffffffff,
+           0x0000000000000000, 0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f,
+           0xffffffffffffffff, 0x0000000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f);
+  VLOAD_64(v3, 0xfff2ffffffffffff, 0x0100000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f, 0xfff2ffffffffffff, 0x0100000000000000,
+           0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, 0xfff2ffffffffffff,
+           0x0100000000000000, 0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f,
+           0xfff2ffffffffffff, 0x0100000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(28, v1, 0xdd, 0xdd);
+};
+
 int main(void) {
  INIT_CHECK();
  enable_vec();
@@ -301,6 +359,7 @@ int main(void) {
  TEST_CASE4();
  TEST_CASE5();
  TEST_CASE6();
+  TEST_CASE7();

  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmsif.c b/apps/riscv-tests/isa/rv64uv/vmsif.c
index 7f682bba3..24a5f3fd3 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsif.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsif.c
@@ -10,17 +10,58 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsif.m v2, v3");
-  VCMP_U8(1, v2, 15, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 15, 0);
 }

 void TEST_CASE2() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 0xff, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 1, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x08, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(16, e32, m1);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(4, v2, 0x0F, 0x00);
+
+  VSET(8, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0);
+  VSET(512, e8, m2);
+  asm volatile("vmsif.m v2, v4");
+  VSET(16, e32, m2);
+  VCMP_U32(5, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 1, 0, 0,
+           0, 0, 0);
+
+  VSET(16, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0, 1685, 0, 0, 1, 0, 0, 0, 0);
+  VSET(1024, e8, m4);
+  asm volatile("vmsif.m v0, v4");
+  VSET(32, e32, m2);
+  VCMP_U32(6, v0, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 1, 0, 0,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+void TEST_CASE3() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsif.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 11, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(7, v2, 11, 0, 0, 0, 0, 0, 0, 0);
 }

 int main(void) {
@@ -29,5 +70,6 @@ int main(void) {
  INIT_CHECK();
  enable_vec();
  enable_fp();
  TEST_CASE1();
  TEST_CASE2();
+  TEST_CASE3();
  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmsof.c b/apps/riscv-tests/isa/rv64uv/vmsof.c
index b5dc5aae1..24db47531 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsof.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsof.c
@@ -10,8 +10,9 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsof.m v2, v3");
-  VCMP_U8(1, v2, 8, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsof.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 8, 0);
 }

 void TEST_CASE2() {
@@ -19,8 +20,17 @@ void TEST_CASE2() {
  VLOAD_8(v3, 0, 0, 0, 1, 0, 0, 0, 0);
  VLOAD_8(v0, 3, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsof.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsof.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 0, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x38, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsof.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 0x10, 0);
 }

 int main(void) {
diff --git a/apps/script/viota.py b/apps/script/viota.py
new file mode 100755
index 000000000..d74327252
--- /dev/null
+++ b/apps/script/viota.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+import random
+import sys
+
+def generate_bit_vector(size):
+    # Generate a random bit vector I of size 'size'
+    I = [random.randint(0, 1) for _ in range(size)]
+
+    # Initialize the accumulator and the output vector O
+    accumulator = 0
+    O = []
+
+    # Compute each element of O based on I and the accumulator
+    for bit in I:
+        O.append(accumulator)
+        accumulator += bit
+
+    return I, O
+
+def format_I_vector_as_binary(I, size):
+    # Format the I vector in chunks of 8 bits in reverse order
+    bin_chunks = [
+        "0b" + "".join(str(bit) for bit in I[i:i+8][::-1])
+        for i in range(0, size, 8)
+    ]
+    return ", ".join(bin_chunks)
+
+def format_O_vector(O):
+    # Format the O vector as individual elements
+    return ", ".join(f"{val}" for val in O)
+
+def generate_test_case(size):
+    # Generate I and O vectors
+    I, O = generate_bit_vector(size)
+
+    # Format I as binary strings in chunks of 8 bits and O as individual elements
+    I_formatted = format_I_vector_as_binary(I, size)
+    O_formatted = format_O_vector(O)
+
+    # Prepare the test case template
+    test_case = f"""
+void TEST_CASE1() {{
+  VSET({int(size/8)}, e8, m1);
+  VLOAD_8(v1, {I_formatted});
+  VSET({size}, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(1, v2, {O_formatted});
+}}
+"""
+
+    return test_case
+
+# Example of using the function
+if __name__ == "__main__":
+    size = int(sys.argv[1])
+    print(generate_test_case(size))

From 7015639c194b704d7f136861cea67e968cfa59ff Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:25:17 +0100
Subject: [PATCH 6/8] [hardware] Refactor the MASKU

Comment: the lanes are not synchronized when sending operands.
Therefore, the spill regs need to handshake the lanes individually.
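A minimal sketch of the per-lane handshake this implies (illustrative only:
the masku_operand_* names below are placeholders, not the actual MASKU port
list; the spill_register interface is the same one already used in sldu.sv):

    // One spill register per lane. Every lane owns a private valid/ready
    // pair, so a stalling lane does not back-pressure the other lanes.
    for (genvar l = 0; l < NrLanes; l++) begin : gen_masku_operand_spill_regs
      spill_register #(
        .T(elen_t)
      ) i_masku_operand_spill_reg (
        .clk_i  (clk_i                   ),
        .rst_ni (rst_ni                  ),
        .data_i (masku_operand_d[l]      ),
        .valid_i(masku_operand_valid_d[l]),
        .ready_o(masku_operand_ready_o[l]),
        .data_o (masku_operand_q[l]      ),
        .valid_o(masku_operand_valid_q[l]),
        .ready_i(masku_operand_ready_i[l])
      );
    end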
--- hardware/include/ara_pkg.sv | 8 +- hardware/include/rvv_pkg.sv | 7 + hardware/src/ara_dispatcher.sv | 352 ++++-- hardware/src/lane/lane_sequencer.sv | 218 ++-- hardware/src/lane/operand_requester.sv | 18 +- hardware/src/lane/simd_alu.sv | 7 +- hardware/src/lane/valu.sv | 119 +- hardware/src/lane/vmfpu.sv | 13 +- hardware/src/masku/masku.sv | 1412 +++++++++++++----------- hardware/src/masku/masku_operands.sv | 181 +-- 10 files changed, 1362 insertions(+), 973 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 6fa695a7f..d071463f4 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -140,9 +140,15 @@ package ara_pkg; // Floating-point comparison instructions VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE, // Integer comparison instructions - VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT, + VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT, // Integer add-with-carry and subtract-with-borrow carry-out instructions VMADC, VMSBC, + // Mask to mask + VMSBF, VMSOF, VMSIF, + // Mask to non-mask + VIOTA, VID, + // Mask to scalar + VCPOP, VFIRST, // Mask operations VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR, // Scalar moves from VRF diff --git a/hardware/include/rvv_pkg.sv b/hardware/include/rvv_pkg.sv index 12a859408..7120ac295 100644 --- a/hardware/include/rvv_pkg.sv +++ b/hardware/include/rvv_pkg.sv @@ -161,4 +161,11 @@ package rvv_pkg; // The mask register is always vreg[0] localparam VMASK = 5'b00000; + ///////////////////////// + // VLEN restrictions // + ///////////////////////// + + // RISC-V Maximum VLEN == 64Ki + localparam int unsigned RISCV_MAX_VLEN = 1 << 16; + endpackage : rvv_pkg diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 022a35a89..1915c47db 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -69,7 +69,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( `FF(csr_vstart_q, csr_vstart_d, '0) `FF(csr_vl_q, csr_vl_d, '0) - `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}) `FF(csr_vxsat_q, csr_vxsat_d, '0) `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. 
@@ -337,6 +337,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( null_vslideup = 1'b0; + vfmvfs_result = ara_resp_i.resp; + is_decoding = 1'b0; in_lane_op = 1'b0; @@ -551,7 +553,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin - csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vtype_d = '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}; csr_vl_d = '0; end @@ -684,22 +686,52 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMSLTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMSLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -908,28 +940,68 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMSLTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMSLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + 
ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMSGTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMSGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -1078,22 +1150,52 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMSGTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMSGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -1282,11 +1384,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 5'b10000: begin ara_req.op = ara_pkg::VCPOP; - ara_req.use_vs1 = 1'b0; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; end 5'b10001: begin ara_req.op = ara_pkg::VFIRST; - ara_req.use_vs1 = 1'b0; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; end default :; endcase @@ -1320,14 +1422,40 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end 6'b010100: begin - ara_req.use_vd_op = 1'b1; - ara_req.use_vs1 = 1'b0; + // VMSBF, -OF, -IF, require bit-level masking + // vd is fetched for correct mask undisturbed + ara_req.use_vs1 = 1'b0; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; case (insn.varith_type.rs1) - 5'b00001: ara_req.op = ara_pkg::VMSBF; - 5'b00010: ara_req.op = ara_pkg::VMSOF; - 5'b00011: ara_req.op = ara_pkg::VMSIF; - 5'b10000: ara_req.op = ara_pkg::VIOTA; - 5'b10001: ara_req.op = ara_pkg::VID; + 5'b00001: begin + ara_req.op = ara_pkg::VMSBF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b00010: begin + ara_req.op = ara_pkg::VMSOF; + // This is a mask-to-mask 
operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b00011: begin + ara_req.op = ara_pkg::VMSIF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b10000: begin + ara_req.op = ara_pkg::VIOTA; + ara_req.use_vd_op = 1'b0; + end + 5'b10001: begin + ara_req.op = ara_pkg::VID; + ara_req.use_vd_op = 1'b0; + ara_req.use_vs2 = 1'b0; + end endcase end 6'b001000: ara_req.op = ara_pkg::VAADDU; @@ -1335,63 +1463,61 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001010: ara_req.op = ara_pkg::VASUBU; 6'b001011: ara_req.op = ara_pkg::VASUB; 6'b011000: begin - ara_req.op = ara_pkg::VMANDNOT; - // Prefer mask operation on EW8 encoding - // In mask operations, vs1, vs2, vd should - // have the same encoding. - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.op = ara_pkg::VMANDNOT; + // The source operands should have the same byte encoding + // Minimize reshuffling on mask operations + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMAND; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMXOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMORNOT; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMNAND; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMNOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMXNOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 
= eew_q[ara_req.vs1]; // Force reshuffle + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010010: begin // VXUNARY0 // These instructions do not use vs1 @@ -1985,10 +2111,38 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_valid = 1'b0; end end - 6'b011000: ara_req.op = ara_pkg::VMFEQ; - 6'b011001: ara_req.op = ara_pkg::VMFLE; - 6'b011011: ara_req.op = ara_pkg::VMFLT; - 6'b011100: ara_req.op = ara_pkg::VMFNE; + 6'b011000: begin + ara_req.op = ara_pkg::VMFEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011001: begin + ara_req.op = ara_pkg::VMFLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011011: begin + ara_req.op = ara_pkg::VMFLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011100: begin + ara_req.op = ara_pkg::VMFNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; @@ -2284,20 +2438,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Ara can support 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions who operates on more than 64 bits. unique case (FPUSupport) - FPUSupportHalfSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportHalfSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalfSingle: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) + FPUSupportHalfSingle: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW32) || - int'(ara_req.vtype.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalf: if (int'(ara_req.vtype.vsew) != int'(EW16) || int'(ara_req.eew_vs2) > int'(EW16)) + FPUSupportHalf: if (int'(csr_vtype_q.vsew) != int'(EW16) || int'(ara_req.eew_vs2) > int'(EW16)) illegal_insn = 1'b1; - FPUSupportSingle: if (int'(ara_req.vtype.vsew) != int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) + FPUSupportSingle: if (int'(csr_vtype_q.vsew) != int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportDouble: if (int'(ara_req.vtype.vsew) != int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportDouble: if (int'(csr_vtype_q.vsew) != int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; default: illegal_insn = 1'b1; // Unsupported configuration endcase @@ -2365,12 +2519,54 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( skip_lmul_checks = 1'b1; end 
6'b010111: ara_req.op = ara_pkg::VMERGE; - 6'b011000: ara_req.op = ara_pkg::VMFEQ; - 6'b011001: ara_req.op = ara_pkg::VMFLE; - 6'b011011: ara_req.op = ara_pkg::VMFLT; - 6'b011100: ara_req.op = ara_pkg::VMFNE; - 6'b011101: ara_req.op = ara_pkg::VMFGT; - 6'b011111: ara_req.op = ara_pkg::VMFGE; + 6'b011000: begin + ara_req.op = ara_pkg::VMFEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011001: begin + ara_req.op = ara_pkg::VMFLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011011: begin + ara_req.op = ara_pkg::VMFLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011100: begin + ara_req.op = ara_pkg::VMFNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011101: begin + ara_req.op = ara_pkg::VMFGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011111: begin + ara_req.op = ara_pkg::VMFGE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end 6'b100100: ara_req.op = ara_pkg::VFMUL; 6'b100000: ara_req.op = ara_pkg::VFDIV; 6'b100001: ara_req.op = ara_pkg::VFRDIV; @@ -2527,16 +2723,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Ara can support 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions that operate on more than 64 bits.
unique case (FPUSupport) - FPUSupportHalfSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalfSingle: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW32) || - int'(ara_req.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalf: if (int'(ara_req.vtype.vsew) != int'(EW16)) illegal_insn = 1'b1; - FPUSupportSingle: if (int'(ara_req.vtype.vsew) != int'(EW32)) + FPUSupportHalfSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW64)) illegal_insn = 1'b1; + FPUSupportHalfSingle: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + FPUSupportSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vsew) > int'(EW64)) illegal_insn = 1'b1; + FPUSupportHalf: if (int'(csr_vtype_q.vsew) != int'(EW16)) illegal_insn = 1'b1; + FPUSupportSingle: if (int'(csr_vtype_q.vsew) != int'(EW32)) illegal_insn = 1'b1; - FPUSupportDouble: if (int'(ara_req.vtype.vsew) != int'(EW64)) + FPUSupportDouble: if (int'(csr_vtype_q.vsew) != int'(EW64)) illegal_insn = 1'b1; default: illegal_insn = 1'b1; // Unsupported configuration endcase diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 3ddcfa6eb..6a3dd2b52 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -259,12 +259,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vtype : pe_req.vtype, default : '0 }; + vfu_operation_d.vtype.vsew = pe_req.op inside {[VMFEQ:VMSGT]} ? pe_req.eew_vs2 : pe_req.vtype.vsew; vfu_operation_valid_d = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0; // Vector length calculation vfu_operation_d.vl = pe_req.vl / NrLanes; // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation. - if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; + // Also, if the ALU/VMFPU should pre-process data for the MASKU, force a balanced payload + if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || (|pe_req.vl[idx_width(NrLanes)-1:0] && pe_req.op inside {[VMFEQ:VMXNOR]})) + vfu_operation_d.vl += 1; // Calculate the start element for Lane[i]. This will be forwarded to both opqueues // and operand requesters, with some light modification in the case of a vslide. @@ -277,9 +280,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0; // Mute request if the instruction runs in the lane and the vl is zero. - // Exception 1: insn on mask vectors, as MASKU has to receive something from all lanes - // and the partial results come from VALU and VMFPU. - // Exception 2: during a reduction, all the lanes must cooperate anyway. + // Exception: during a reduction, all the lanes must cooperate anyway. 
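// Restating the mute condition below as a stand-alone predicate (illustrative
// sketch, assuming ara_pkg's vfu_e and ara_op_e types): a lane with no elements
// can drop ALU/MFPU work, except for reductions, which still need this lane for
// the inter-lane accumulation steps.
function automatic logic mute_lane_op(vlen_t vl, vfu_e vfu, ara_op_e op);
  return (vl == '0) && (vfu inside {VFU_Alu, VFU_MFpu})
      && !(op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]});
endfunction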
if (vfu_operation_d.vl == '0 && (vfu_operation_d.vfu inside {VFU_Alu, VFU_MFpu}) && !(vfu_operation_d.op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]})) begin vfu_operation_valid_d = 1'b0; // We are already done with this instruction @@ -337,17 +338,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin @@ -420,17 +421,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin @@ -438,17 +439,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed @@ -490,26 +491,25 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // extra operand regardless of whether it is valid in this lane or not. // This is done to balance the data received by the store unit, which expects // L*64-bits packets only. 
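// Worked example of the balancing, assuming NrLanes = 4 (L = 4): if the last
// element sits in lane 1 (pe_req.end_lane = 1), lanes 2 and 3 must still fetch
// one extra operand each, so the final 4 x 64-bit packet reaches the store unit
// complete. The padding predicate is simply lane_id_i > pe_req.end_lane, as the
// code below shows.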
- if (lane_id_i > pe_req.end_lane) begin + if (lane_id_i > pe_req.end_lane) operand_request[StA].vl += 1; - end operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed @@ -529,9 +529,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) operand_request[SlideAddrGenA].vl += 1; - end operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end @@ -601,7 +600,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, is_slide: 1'b1, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, @@ -614,61 +613,61 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEUP: begin // We need to trim full words from the end of the vector that are not used // as operands by the slide unit. + operand_request[MaskM].vl = (pe_req.vl - pe_req.stride) / NrLanes / ELEN; + // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request[MaskM].vl = - ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> unsigned'(pe_req.vtype.vsew); - - if (((operand_request[MaskM].vl + pe_req.stride) << - unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + if ((operand_request[MaskM].vl) * NrLanes * ELEN != + pe_req.vl - pe_req.stride) operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * ELEN) << NrLanes * ELEN) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not.
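// The recurring ceil-division pattern for mask operands, as a stand-alone
// sketch (illustrative; assumes ELEN = 64): vl mask bits are spread over
// NrLanes lanes fetching ELEN bits per access, and any remainder costs one
// extra access per lane.
function automatic vlen_t mask_accesses_per_lane(vlen_t vl, int unsigned n_lanes);
  vlen_t acc = vl / n_lanes / 64;
  if (acc * n_lanes * 64 != vl) acc += 1; // round up on a partial last word
  return acc;
endfunction
// e.g., NrLanes = 4, vl = 300 -> 300 / 256 rounds up to 2 accesses per lane.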
- operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( - pe_req.vtype.vsew)); - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) + operand_request[MaskM].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin + // todo: balance the mask comparison source requests + + // Mask logical and integer comparisons operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, - eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. - // This is an operation that runs normally on the ALU, and then gets *condensed* and - // reshuffled at the Mask Unit. + // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluA].vl = vfu_operation_d.vl; - end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - operand_request[AluA].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); - if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request[AluA].vl += 1; + // These source regs contain non-mask vectors. + operand_request[AluA].eew = pe_req.eew_vs1; + operand_request[AluA].vl = pe_req.vl / NrLanes; + if ((operand_request[AluA].vl * NrLanes) != pe_req.vl) + operand_request[AluA].vl += 1; + end else begin // Mask logical operations + // These source regs contain mask vectors. + operand_request[AluA].eew = EW64; + operand_request[AluA].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluA].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask logical, integer comparisons, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, @@ -679,88 +678,117 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and - // reshuffled at the Mask Unit. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + + // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluB].vl = vfu_operation_d.vl; + // These source regs contain non-mask vectors. + operand_request[AluB].eew = pe_req.eew_vs2; + operand_request[AluB].vl = pe_req.vl / NrLanes; + if ((operand_request[AluB].vl * NrLanes) != pe_req.vl) + operand_request[AluB].vl += 1; + end else begin // Mask logical, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST + // These source regs contain mask vectors.
+ operand_request[AluB].eew = EW64; + operand_request[AluB].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluB].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluB].vl += 1; end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - operand_request[AluB].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); - if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request[AluB].vl += 1; - end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask fp comparisons operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MulFPUA].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUA].vl += 1; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Mask fp comparisons operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MulFPUB].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUB].vl += 1; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Vd register to provide the correct mask-undisturbed policy at bit level + // This can be a mask or a normal register operand_request[MaskB] = '{ id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + vs : pe_req.vd, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not.
- vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, - hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, + hazard : pe_req.hazard_vd, default : '0 }; - operand_request[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); - if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin - operand_request[MaskB].vl += 1'b1; + // vl and eew depend on the actual eew we are working on + if (pe_req.op inside {VIOTA,VID}) begin + // Non-mask layout + operand_request[MaskB].eew = pe_req.vtype.vsew; + operand_request[MaskB].vl = pe_req.vl / NrLanes; + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl) + operand_request[MaskB].vl += 1; + end else begin // Mask logical, comparisons, VMSBF, VMSIF, VMSOF + // Mask layout + operand_request[MaskB].eew = EW64; + operand_request[MaskB].vl = (pe_req.vl / NrLanes / ELEN); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl) + operand_request[MaskB].vl += 1; end - operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; + operand_request_push[MaskB] = pe_req.use_vd_op; + // All masked operations + // This is always a mask register operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. vl : (pe_req.vl / NrLanes / ELEN), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) operand_request[MaskM].vl += 1; - end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 1baec0780..de2cc4f82 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( automatic elen_t vl_byte; automatic elen_t vstart_byte; automatic elen_t vector_body_len_byte; - automatic elen_t vector_body_len_packets; + automatic elen_t vector_body_len_elements; // Bank we are currently requesting automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; @@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( ?
0 : operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew; vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8); - vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew; - if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte) - vector_body_len_packets += 1; + vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew; + if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte) + vector_body_len_elements += 1; // Final computed length effective_vector_body_length = (operand_request_i[requester_index].scale_vl) - ? vector_body_len_packets + ? vector_body_len_elements : vector_body_length; // Address of the vstart element of the vector in the VRF @@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( end : waw_counters_update if (operand_queue_ready_i[requester_index]) begin - automatic vlen_t num_bytes; + automatic vlen_t num_elements; // Operand request lane_operand_req_transposed[requester_index][bank] = !stall; @@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); - if (requester_metadata_q.len < num_bytes) begin + num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_elements) begin requester_metadata_d.len = 0; end else begin - requester_metadata_d.len = requester_metadata_q.len - num_bytes; + requester_metadata_d.len = requester_metadata_q.len - num_elements; end end : op_req_grant diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index 242c0d2bc..572bc35af 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( VMXOR : res = operand_a_i ^ operand_b_i; VMXNOR : res = ~(operand_a_i ^ operand_b_i); - // vmsbf, vmsof, vmsif and viota operand generation - VMSBF, VMSOF, VMSIF, VIOTA : res = opb; - - // Vector count population and find first set bit instructions - VCPOP, VFIRST : res = operand_b_i; + // Mask operands pass-through + VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i; // Arithmetic instructions VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index d3ce82bee..53a14e177 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -175,22 +175,25 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Mask operands // ///////////////////// + logic mask_operand_valid; logic mask_operand_ready; logic mask_operand_gnt; - assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask + & result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_gnt = mask_operand_valid & mask_operand_ready; spill_register #( .T(elen_t) ) i_mask_operand_register ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .data_o (mask_operand_o ), - .valid_o (mask_operand_valid_o ), - .ready_i (mask_operand_ready_i ), - .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), - .valid_i 
(result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]), - .ready_o (mask_operand_ready ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_o (mask_operand_o ), + .valid_o (mask_operand_valid_o ), + .ready_i (mask_operand_ready_i ), + .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), + .valid_i (mask_operand_valid ), + .ready_o (mask_operand_ready ) ); ////////////////////// @@ -395,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Remaining elements of the current instruction in the commit phase vlen_t commit_cnt_d, commit_cnt_q; + // How many elements are issued/committed + logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit; + logic [6:0] element_cnt_issue; + logic [6:0] element_cnt_commit; + always_comb begin: p_valu // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -436,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Don't prevent commit by default prevent_commit = 1'b0; + // How many elements are we processing this cycle? + element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue}; + + element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew)); + element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit}; + //////////////////////////////////////// // Write data into the result queue // //////////////////////////////////////// @@ -450,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? - automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -524,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; end end end @@ -550,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
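// Expanded view of element_cnt_issue computed above (illustrative sketch,
// assuming ELEN = 64): mask-logical operations pack one element per bit, so a
// 64-bit ALU word completes 64 elements; every other operation completes
// 1 << (EW64 - vsew) elements per word.
function automatic logic [6:0] elements_per_word(logic is_mask_op, vew_e vsew);
  return is_mask_op ? 7'd64 : 7'(1 << (unsigned'(EW64) - unsigned'(vsew)));
endfunction
// e.g., EW8 -> 8 elements/word, EW64 -> 1 element/word, VMAND -> 64 elements/word.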
- automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -656,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Give the done to the main sequencer commit_cnt_d = '0; @@ -693,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Commit and give the done to the main sequencer commit_cnt_d = '0; @@ -739,7 +731,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (|result_queue_valid_q) vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be); - // Received a grant from the VRF. + // Received a grant from the VRF or MASKU. // Deactivate the request. 
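// The queue entry has two possible consumers: the VRF write-back path
// (alu_result_gnt_i) and the MASKU operand path (mask_operand_gnt). They should
// be mutually exclusive per entry, since a mask-destined result is not offered
// to the VRF, so a single pop condition covers both (illustrative name):
// assign result_queue_pop = alu_result_gnt_i | mask_operand_gnt;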
if (alu_result_gnt_i || mask_operand_gnt) begin result_queue_valid_d[result_queue_read_pnt_q] = 1'b0; @@ -754,9 +746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction - if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + if (!is_reduction(vinsn_commit.op)) begin + automatic logic [6:0] element_cnt = element_cnt_commit; + commit_cnt_d = commit_cnt_q - element_cnt; + if (commit_cnt_q < element_cnt) commit_cnt_d = '0; + end end // Finished committing the results of a vector instruction @@ -770,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; - else begin - // We are asking for bits, and we want at least one chunk of bits if - // vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew) - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.commit_cnt != '0) + commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instruction starts after the reduction commits @@ -806,7 +790,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions + // Instructions that execute in the mask unit will process the mask there directly + // VMADC/VMSBC require mask bits in the ALU + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]}) + ?
1'b1 + : vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing @@ -822,22 +810,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; sldu_transactions_cnt_d = $clog2(NrLanes) + 1; issue_cnt_d = vfu_operation_i.vl; - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; - else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> - vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; - end end if (vinsn_queue_d.commit_cnt == '0) - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; - else begin - // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; - end + commit_cnt_d = vfu_operation_i.vl; // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index bbeb78f32..846638243 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -1245,21 +1245,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[16*b] : vfpu_processed_result[16*b]; - for (int b = 0; b < 4; b++) vfpu_processed_result[16*b+1] = vfpu_mask[2*b]; end EW32: begin for (int b = 0; b < 2; b++) vfpu_processed_result[32*b] = (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[32*b] : vfpu_processed_result[32*b]; - for (int b = 0; b < 2; b++) vfpu_processed_result[32*b+1] = vfpu_mask[4*b]; end EW64: begin for (int b = 0; b < 1; b++) vfpu_processed_result[b] = (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[b] : vfpu_processed_result[b]; - for (int b = 0; b < 1; b++) vfpu_processed_result[b+1] = vfpu_mask[8*b]; end endcase end @@ -2180,7 +2177,15 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (!vinsn_queue_full && vfu_operation_valid_i && (vfu_operation_i.vfu == VFU_MFpu || vfu_operation_i.op inside {[VMFEQ:VMFGE]})) begin - vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; + // Masks are handled in the MASKU directly for comparisons + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMFEQ:VMFGE]} + ? 1'b1 + : vfu_operation_i.vm; + // During comparisons, vd_op is for the masku, not for the VMFPU + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].use_vd_op = vfu_operation_i.op inside {[VMFEQ:VMFGE]} + ? 
1'b0 + : vfu_operation_i.use_vd_op; // Initialize counters if (vinsn_queue_d.issue_cnt == '0 && !prevent_commit) begin diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 1ea497dc4..e83998965 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -65,6 +65,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vlen_t read_cnt_d, read_cnt_q; // Remaining elements of the current instruction in the issue phase vlen_t issue_cnt_d, issue_cnt_q; + // Remaining elements of the current instruction to be validated in the result queue + vlen_t processing_cnt_d, processing_cnt_q; // Remaining elements of the current instruction in the commit phase vlen_t commit_cnt_d, commit_cnt_q; @@ -81,19 +83,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes-1:0] masku_operand_alu_ready; // ALU/FPU result (deshuffled) - logic [NrLanes*ELEN-1:0] masku_operand_alu_seq; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq; - // vs2 (shuffled) - elen_t [NrLanes-1:0] masku_operand_vs2; - logic [NrLanes-1:0] masku_operand_vs2_valid; - logic [NrLanes-1:0] masku_operand_vs2_ready; + // vd (shuffled) + elen_t [NrLanes-1:0] masku_operand_vd; + logic [NrLanes-1:0] masku_operand_vd_valid; + logic [NrLanes-1:0] masku_operand_vd_ready; - assign masku_operand_vs2_ready = 1'b0; - - // vs2 (deshuffled) - logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq; - logic [ NrLanes-1:0] masku_operand_vs2_seq_valid; - logic [ NrLanes-1:0] masku_operand_vs2_seq_ready; + // vd (deshuffled) + logic [NrLanes*DataWidth-1:0] masku_operand_vd_seq; + logic [ NrLanes-1:0] masku_operand_vd_seq_valid; + logic [ NrLanes-1:0] masku_operand_vd_seq_ready; // Mask elen_t [NrLanes-1:0] masku_operand_m; @@ -101,15 +101,13 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes-1:0] masku_operand_m_ready; // Mask deshuffled - logic [NrLanes*ELEN-1:0] masku_operand_m_seq; - logic [NrLanes-1:0] masku_operand_m_seq_valid; - logic [NrLanes-1:0] masku_operand_m_seq_ready; + logic [NrLanes*DataWidth-1:0] masku_operand_m_seq; // Insn-queue related signal pe_req_t vinsn_issue; - logic [NrLanes*ELEN-1:0] bit_enable_mask; - logic [NrLanes*ELEN-1:0] alu_result_compressed; + logic [NrLanes*DataWidth-1:0] bit_enable_mask; + logic [NrLanes*DataWidth-1:0] alu_result_compressed_seq; // Performs all shuffling and deshuffling of mask operands (including masks for mask instructions) // Furthermore, it buffers certain operands that would create long critical paths @@ -133,72 +131,100 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .masku_operand_alu_valid_o ( masku_operand_alu_valid ), .masku_operand_alu_ready_i ( masku_operand_alu_ready ), .masku_operand_alu_seq_o ( masku_operand_alu_seq ), - .masku_operand_alu_seq_valid_o ( ), - .masku_operand_alu_seq_ready_i ( ), - .masku_operand_vs2_o ( masku_operand_vs2 ), - .masku_operand_vs2_valid_o ( masku_operand_vs2_valid ), - .masku_operand_vs2_ready_i ( masku_operand_vs2_ready ), - .masku_operand_vs2_seq_o ( masku_operand_vs2_seq ), - .masku_operand_vs2_seq_valid_o ( masku_operand_vs2_seq_valid ), - .masku_operand_vs2_seq_ready_i ( masku_operand_vs2_seq_ready ), + .masku_operand_alu_seq_valid_o ( ), + .masku_operand_alu_seq_ready_i ( '0 ), + .masku_operand_vd_o ( masku_operand_vd ), + .masku_operand_vd_valid_o ( masku_operand_vd_valid ), + .masku_operand_vd_ready_i ( masku_operand_vd_ready ), + .masku_operand_vd_seq_o ( masku_operand_vd_seq ), + .masku_operand_vd_seq_valid_o ( masku_operand_vd_seq_valid ), + 
.masku_operand_vd_seq_ready_i ( '0 ), .masku_operand_m_o ( masku_operand_m ), .masku_operand_m_valid_o ( masku_operand_m_valid ), .masku_operand_m_ready_i ( masku_operand_m_ready ), .masku_operand_m_seq_o ( masku_operand_m_seq ), - .masku_operand_m_seq_valid_o ( ), - .masku_operand_m_seq_ready_i ( ), + .masku_operand_m_seq_valid_o ( ), + .masku_operand_m_seq_ready_i ( '0 ), .bit_enable_mask_o ( bit_enable_mask ), - .alu_result_compressed_o ( alu_result_compressed ) + .alu_result_compressed_seq_o ( alu_result_compressed_seq ) ); + // Local Parameter for mask logical instructions + // + // Don't change this parameter! + localparam integer unsigned VmLogicalParallelism = NrLanes*DataWidth; - // Local Parameter W_CPOP and W_VFIRST + // Local Parameter VMSBF, VMSIF, VMSOF + // + localparam integer unsigned VmsxfParallelism = NrLanes < 4 ? 2 : NrLanes/2; + // Ancillary signals + logic [VmsxfParallelism-1:0] vmsbf_buffer; + logic [NrLanes*DataWidth-1:0] alu_result_vmsif_vm; + logic [NrLanes*DataWidth-1:0] alu_result_vmsbf_vm; + logic [NrLanes*DataWidth-1:0] alu_result_vmsof_vm; + + // Local Parameter VIOTA, VID + // + // How many output results are computed in parallel by VIOTA + localparam integer unsigned ViotaParallelism = NrLanes < 4 ? 2 : NrLanes/2; + // Check if parameters are within range + if (ViotaParallelism > NrLanes || (ViotaParallelism & (ViotaParallelism - 1)) != 0) begin + $fatal(1, "Parameter ViotaParallelism cannot be higher than NrLanes and should be a power of 2."); + end + // VLENMAX can be 64Ki elements at most - 16 bits per adder are enough + logic [ViotaParallelism-1:0] [idx_width(RISCV_MAX_VLEN)-1:0] viota_res; + logic [idx_width(RISCV_MAX_VLEN)-1:0] viota_acc, viota_acc_d, viota_acc_q; + // Ancillary signal to tweak the VRF byte-enable, accounting for an unbalanced write, + // i.e., when the number of elements does not perfectly divide NrLanes + logic [3:0] elm_per_lane; // From 0 to 8 elements per lane + logic [NrLanes-1:0] additional_elm; // There can be an additional element for some lanes + // BE signals for VIOTA + logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_viota_shuf; + + // Local Parameter VcpopParallelism and VfirstParallelism // - // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. + // Description: Parameters VcpopParallelism and VfirstParallelism enable time multiplexing of the vcpop.m and vfirst.m instructions. // - // Legal range W_CPOP: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 - // Legal range W_VFIRST: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range VcpopParallelism: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range VfirstParallelism: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 // // Execution time example for vcpop.m (similar for vfirst.m): - // W_CPOP = 64; VLEN = 1024; vl = 1024 - // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] + // VcpopParallelism = 64; VLEN = 1024; vl = 1024 + // t_vcpop.m = VLEN/VcpopParallelism = 16 [Cycles] - localparam int W_CPOP = 16; - localparam int W_VFIRST = 16; + localparam int VcpopParallelism = 16; + localparam int VfirstParallelism = 16; // derived parameters - localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? W_CPOP : W_VFIRST; - localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; - localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; + localparam int MAX_VcpopParallelism_VFIRST = (VcpopParallelism > VfirstParallelism) ?
VcpopParallelism : VfirstParallelism; + localparam int N_SLICES_CPOP = NrLanes * DataWidth / VcpopParallelism; + localparam int N_SLICES_VFIRST = NrLanes * DataWidth / VfirstParallelism; // Check if parameters are within range - if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 8)) begin - $fatal(1, "Parameter W_CPOP must be power of 2."); - end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 8)) begin - $fatal(1, "Parameter W_VFIRST must be power of 2."); + if (((VcpopParallelism & (VcpopParallelism - 1)) != 0) || (VcpopParallelism < 8)) begin + $fatal(1, "Parameter VcpopParallelism must be a power of 2, at least 8."); + end else if (((VfirstParallelism & (VfirstParallelism - 1)) != 0) || (VfirstParallelism < 8)) begin + $fatal(1, "Parameter VfirstParallelism must be a power of 2, at least 8."); end // VFIRST and VCPOP Signals - logic [NrLanes*ELEN-1:0] vcpop_operand; - logic [$clog2(W_CPOP):0] popcount; + logic [NrLanes*DataWidth-1:0] vcpop_operand; + logic [$clog2(VcpopParallelism):0] popcount; logic [$clog2(VLEN):0] popcount_d, popcount_q; - logic [$clog2(W_VFIRST)-1:0] vfirst_count; + logic [$clog2(VfirstParallelism)-1:0] vfirst_count; logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; logic vfirst_empty; - logic [NrLanes-1:0] vcpop_vfirst_vs2_ready; // counter to keep track of how many slices of the vcpop_operand have been processed - logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; - logic [W_CPOP-1:0] vcpop_slice; - logic [W_VFIRST-1:0] vfirst_slice; + logic [VcpopParallelism-1:0] vcpop_slice; + logic [VfirstParallelism-1:0] vfirst_slice; // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables - logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; - logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m, masku_operand_alu_seq_f, masku_operand_alu_seq_ff; - logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; - logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; - logic [ 13:0] iteration_count_d, iteration_count_q; - logic not_found_one_d, not_found_one_q; - logic [ NrLanes-1:0] vmsif_vmsof_vmsbf_vs2_ready; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m; + logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_shuf; + logic found_one, found_one_d, found_one_q; - // Control flow for mask operands - assign masku_operand_vs2_seq_ready = vcpop_vfirst_vs2_ready | vmsif_vmsof_vmsbf_vs2_ready; + // How many elements we are processing per cycle + logic [idx_width(NrLanes*DataWidth):0] delta_elm_d, delta_elm_q; + + // MASKU ALU: is a VRF word result or a scalar result fully valid? + logic out_vrf_word_valid, out_scalar_valid; //////////////////////////////// // Vector instruction queue // //////////////////////////////// @@ -311,7 +337,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // and one pointer to indicate which `payload_t` we are currently // reading from and writing into the lanes (read_pnt). logic [idx_width(ResultQueueDepth)-1:0] result_queue_write_pnt_d, result_queue_write_pnt_q; - logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q, result_queue_read_pnt_m; + logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q; // We need to count how many valid elements there are in this result queue.
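// Why idx_width(ResultQueueDepth)+1 bits: the occupancy counter declared below
// must be able to hold the value ResultQueueDepth itself, which the "full"
// comparison uses. A sketch of the sizing rule (hypothetical helper):
function automatic int unsigned occupancy_width(int unsigned depth);
  return cf_math_pkg::idx_width(depth) + 1; // one extra bit to encode `depth`
endfunction
// e.g., ResultQueueDepth = 2 -> a 2-bit counter counting 0, 1, 2.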
logic [idx_width(ResultQueueDepth):0] result_queue_cnt_d, result_queue_cnt_q; // Vector to register the final grants from the operand requesters, which indicate @@ -319,6 +345,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // that the result was accepted by the operand requester stage logic [NrLanes-1:0] result_final_gnt_d, result_final_gnt_q; + // Result queue + elen_t [NrLanes-1:0] result_queue_background_data; + elen_t [NrLanes-1:0] result_queue_mask_seq; + logic [NrLanes*DataWidth-1:0] background_data_init_seq, background_data_init_shuf; + // Is the result queue full? logic result_queue_full; assign result_queue_full = (result_queue_cnt_q == ResultQueueDepth); @@ -332,41 +363,135 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_valid_q <= '0; result_queue_write_pnt_q <= '0; result_queue_read_pnt_q <= '0; - result_queue_read_pnt_m <= '0; result_queue_cnt_q <= '0; - alu_result_f <= '0; - alu_result_ff <= '0; - not_found_one_q <= 1'b1; - masku_operand_alu_seq_f <= '0; - masku_operand_alu_seq_ff <= '0; - iteration_count_q <= '0; end else begin result_queue_q <= result_queue_d; result_queue_valid_q <= result_queue_valid_d; result_queue_write_pnt_q <= result_queue_write_pnt_d; - result_queue_read_pnt_m <= result_queue_write_pnt_q; - result_queue_read_pnt_q <= (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_read_pnt_m : result_queue_read_pnt_d; + result_queue_read_pnt_q <= result_queue_read_pnt_d; result_queue_cnt_q <= result_queue_cnt_d; - alu_result_f <= (pe_req_ready_o) ? '0 : (!vinsn_issue.vm) ? alu_result_vm : alu_result_vm_seq; - alu_result_ff <= alu_result_f; - not_found_one_q <= not_found_one_d; - masku_operand_alu_seq_f <= (pe_req_ready_o) ? '0 : masku_operand_alu_seq_m; - masku_operand_alu_seq_ff <= masku_operand_alu_seq_f; - iteration_count_q <= iteration_count_d; end end - // iteration count for masked instrctions - always_comb begin - if (vinsn_issue_valid && (&masku_operand_alu_valid || &masku_operand_vs2_seq_valid)) begin - iteration_count_d = iteration_count_q + 1'b1; - end else begin - iteration_count_d = iteration_count_q; - end - if (pe_req_ready_o && !vinsn_issue_valid) begin - iteration_count_d = '0; - end - end + //////////////////// + // ALU counters // + //////////////////// + + // Compile-time minimum among five different numbers + function automatic int unsigned min5(int unsigned a, int unsigned b, int unsigned c, int unsigned d, int unsigned e); + return (a < b) ? ((a < c) ? ((a < d) ? ((a < e) ? a : e) : (d < e ? d : e)) + : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)) + : ((b < c) ? ((b < d) ? ((b < e) ? b : e) : (d < e ? d : e)) + : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)); + endfunction + + // What is the minimum supported parallelism? + localparam int unsigned MIN_MASKU_ALU_WIDTH = min5( + ViotaParallelism, + VmsxfParallelism, + VmLogicalParallelism, + VcpopParallelism, + VfirstParallelism + ); + + localparam int unsigned IN_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [IN_READY_CNT_WIDTH-1:0] in_ready_cnt_t; + logic in_ready_cnt_en, in_ready_cnt_clr; + in_ready_cnt_t in_ready_cnt_delta_q, in_ready_cnt_q; + in_ready_cnt_t in_ready_threshold_d, in_ready_threshold_q; + + assign in_ready_cnt_delta_q = 1; + + // Counter to trigger the input ready. + // Ready triggered when all the slices of the VRF word have been consumed. 
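// Sketch of the enable/clear discipline intended for the counter instantiated
// below (the slice-handshake name is illustrative; the actual drivers are set
// in the MASKU control logic further down):
// assign in_ready_cnt_en  = masku_alu_slice_handshake;
// assign in_ready_cnt_clr = masku_alu_slice_handshake
//                         && (in_ready_cnt_q == in_ready_threshold_q);
// One step per consumed slice; the clear fires together with the lanes' operand
// ready, once all slices of the current VRF word have been processed.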
+ delta_counter #( + .WIDTH(IN_READY_CNT_WIDTH) + ) i_in_ready_cnt ( + .clk_i, + .rst_ni, + .clear_i(in_ready_cnt_clr ), + .en_i (in_ready_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(in_ready_cnt_delta_q), + .d_i ('0 ), + .q_o (in_ready_cnt_q ), + .overflow_o(/* Unused */) + ); + + localparam int unsigned IN_M_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [IN_M_READY_CNT_WIDTH-1:0] in_m_ready_cnt_t; + logic in_m_ready_cnt_en, in_m_ready_cnt_clr; + in_m_ready_cnt_t in_m_ready_cnt_q, in_m_ready_cnt_delta_q; + in_ready_cnt_t in_m_ready_threshold_d, in_m_ready_threshold_q; + + assign in_m_ready_cnt_delta_q = 1; + + // Counter to trigger the input ready. + // Ready triggered when all the slices of the VRF word have been consumed. + delta_counter #( + .WIDTH(IN_M_READY_CNT_WIDTH) + ) i_in_m_ready_cnt ( + .clk_i, + .rst_ni, + .clear_i(in_m_ready_cnt_clr ), + .en_i (in_m_ready_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(in_m_ready_cnt_delta_q), + .d_i ('0 ), + .q_o (in_m_ready_cnt_q ), + .overflow_o(/* Unused */) + ); + + localparam int unsigned OUT_VALID_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [OUT_VALID_CNT_WIDTH-1:0] out_valid_cnt_t; + logic out_valid_cnt_en, out_valid_cnt_clr; + out_valid_cnt_t out_valid_cnt_q, out_valid_cnt_delta_q; + out_valid_cnt_t out_valid_threshold_d, out_valid_threshold_q; + + assign out_valid_cnt_delta_q = 1; + + // Counter to trigger the output valid. + // Valid triggered when all the slices of the VRF word have been consumed. + delta_counter #( + .WIDTH(OUT_VALID_CNT_WIDTH) + ) i_out_valid_cnt ( + .clk_i, + .rst_ni, + .clear_i(out_valid_cnt_clr ), + .en_i (out_valid_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(out_valid_cnt_delta_q), + .d_i ('0 ), + .q_o (out_valid_cnt_q ), + .overflow_o(/* Unused */) + ); + + // How many (64*NrLanes)-bit VRF words we can get, maximum? 
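// Worked derivation of the bound below, assuming ELEN = 64 and max LMUL = 8:
// a register group spans up to 8 * VLEN bits, and one VRF word carries
// NrLanes * 64 bits, so the word count tops out at
//   (8 * VLEN) / (NrLanes * 64) = VLEN / NrLanes / 8.
// e.g., VLEN = 4096, NrLanes = 4 -> at most 128 VRF words per instruction.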
+ localparam int unsigned MAX_NUM_VRF_WORDS = VLEN / NrLanes / 8; + logic iteration_cnt_clr; + logic [idx_width(MAX_NUM_VRF_WORDS)-1:0] iteration_cnt_q, iteration_cnt_delta_q; + + assign iteration_cnt_delta_q = 1; + + // Iteration count for masked instructions + // One iteration == One full output slice processed + delta_counter #( + .WIDTH(idx_width(MAX_NUM_VRF_WORDS)) + ) i_iteration_cnt ( + .clk_i, + .rst_ni, + .clear_i(iteration_cnt_clr ), + .en_i (out_valid_cnt_clr ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(iteration_cnt_delta_q), + .d_i ('0 ), + .q_o (iteration_cnt_q ), + .overflow_o(/* Unused */) + ); //////////////////////////// //// Scalar result reg //// @@ -389,19 +514,15 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Mask ALU // //////////////// - elen_t [NrLanes-1:0] alu_result; - logic [NrLanes*ELEN-1:0] mask; - - // keep track if first 1 mask element was found - logic vfirst_found; + elen_t [NrLanes-1:0] alu_result; // assign operand slices to be processed by popcount and lzc - assign vcpop_slice = vcpop_operand[(vcpop_slice_cnt_q * W_CPOP) +: W_CPOP]; - assign vfirst_slice = vcpop_operand[(vcpop_slice_cnt_q * W_VFIRST) +: W_VFIRST]; + assign vcpop_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_CPOP)-1:0] * VcpopParallelism) +: VcpopParallelism]; + assign vfirst_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_VFIRST)-1:0] * VfirstParallelism) +: VfirstParallelism]; // Population count for vcpop.m instruction popcount #( - .INPUT_WIDTH (W_CPOP) + .INPUT_WIDTH (VcpopParallelism) ) i_popcount ( .data_i (vcpop_slice), .popcount_o(popcount ) @@ -409,7 +530,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Trailing zero counter lzc #( - .WIDTH(W_VFIRST), + .WIDTH(VfirstParallelism), .MODE (0) ) i_clz ( .in_i (vfirst_slice ), @@ -417,204 +538,190 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .empty_o (vfirst_empty ) ); - always_comb begin: p_mask_alu - alu_result = '0; - not_found_one_d = pe_req_ready_o ? 
1'b1 : not_found_one_q; - alu_result_vm = '0; - alu_result_vm_m = '0; - alu_result_vm_seq = '0; - masku_operand_alu_seq_m = '0; - mask = '0; - vcpop_operand = '0; + // Vector instructions currently running + logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; - if (vinsn_issue_valid) begin + // Interface with the main sequencer + pe_resp_t pe_resp; - // Mask generation - unique case (vinsn_issue.op) inside - [VMSBF:VID] : - if (&masku_operand_alu_valid) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : for (int i = 0; i < (DataWidth * NrLanes)/8; i++) - mask [(i*8) +: 8] = {8{bit_enable_mask [i+(((DataWidth * NrLanes)/8)*(iteration_count_d-1))]}}; - EW16: for (int i = 0; i < (DataWidth * NrLanes)/16; i++) - mask [(i*16) +: 16] = {16{bit_enable_mask [i+(((DataWidth * NrLanes)/16)*(iteration_count_d-1))]}}; - EW32: for (int i = 0; i < (DataWidth * NrLanes)/32; i++) - mask [(i*32) +: 32] = {32{bit_enable_mask [i+(((DataWidth * NrLanes)/32)*(iteration_count_d-1))]}}; - EW64: for (int i = 0; i < (DataWidth * NrLanes)/64; i++) - mask [(i*64) +: 64] = {64{bit_enable_mask [i+(((DataWidth * NrLanes)/64)*(iteration_count_d-1))]}}; - endcase - end else begin - mask = '0; - end - default:; - endcase + // Effective MASKU stride in case of VSLIDEUP + // MASKU receives chunks of 64 * NrLanes mask bits from the lanes + // VSLIDEUP only needs the bits whose index >= than its stride + // So, the operand requester does not send vl mask bits to MASKU + // and trims all the unused 64 * NrLanes mask bits chunks + // Therefore, the stride needs to be trimmed, too + elen_t trimmed_stride; + + // Information about which is the target FU of the request + assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu; + + always_comb begin + // Tail-agnostic bus + alu_result = '1; + alu_result_vm = '1; + alu_result_vm_m = '1; + alu_result_vm_shuf = '1; + alu_result_vmsif_vm = '1; + alu_result_vmsbf_vm = '1; + alu_result_vmsof_vm = '1; + alu_result_vm = '1; + + vcpop_operand = '0; + + // The result mask should be created here since the output is a non-mask vector + be_viota_seq_d = be_viota_seq_q; + + // Create a bit-masked ALU sequential vector + masku_operand_alu_seq_m = masku_operand_alu_seq + & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); + + // VMSBF, VMSIF, VMSOF default assignments + found_one = found_one_q; + found_one_d = found_one_q; + vmsbf_buffer = '0; + // VIOTA default assignments + viota_acc = viota_acc_q; + viota_acc_d = viota_acc_q; + for (int i = 0; i < ViotaParallelism; i++) viota_res[i] = '0; + if (vinsn_issue_valid) begin // Evaluate the instruction unique case (vinsn_issue.op) inside - [VMANDNOT:VMXNOR]: alu_result = masku_operand_alu; - [VMFEQ:VMSGTU], [VMSGT:VMSBC]: alu_result = alu_result_compressed & bit_enable_mask; + // Mask logical: pass through the result already computed in the ALU + // This operation is never masked + // This operation always writes to multiple of VRF words, and it does not need vd + // This operation can overwrite the destination register without constraints on tail elements + [VMANDNOT:VMXNOR]: alu_result_vm_m = masku_operand_alu_seq; + // Comparisons: mask out the masked out bits of this pre-computed slice + [VMFEQ:VMSGT]: alu_result_vm_m = alu_result_compressed_seq + | ~(masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); + // Add/sub-with-carry/borrow: the masks are all 1 since these operations are NOT masked + [VMADC:VMSBC]: alu_result_vm_m = alu_result_compressed_seq; + // VMSBF, VMSOF, VMSIF: compute a 
slice of the output and mask out the masked out bits [VMSBF:VMSIF] : begin - if (&masku_operand_vs2_seq_valid && (&masku_operand_m_valid || vinsn_issue.vm)) begin - for (int i = 0; i < NrLanes * DataWidth; i++) begin - if (masku_operand_vs2_seq[i] == 1'b0) begin - alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; - end else begin - not_found_one_d = 1'b0; - alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; - break; - end - end - alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; - end else begin - alu_result_vm = '0; - end - end - VIOTA: begin - if (&masku_operand_alu_valid) begin - masku_operand_alu_seq_m = masku_operand_alu_seq & bit_enable_mask; - unique case (vinsn_issue.vtype.vsew) - EW8 : begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [7:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; - end else begin - alu_result_vm [7:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; - alu_result_vm_m [(index*8) +: 7] = alu_result_vm [(index*8) +: 7]; - end - end - EW16: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [15:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; - end else begin - alu_result_vm [15:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; - alu_result_vm_m [(index*16) +: 15] = alu_result_vm [(index*16) +: 15]; - end - end - EW32: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [31:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; - end else begin - alu_result_vm [31:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; - alu_result_vm_m [(index*32) +: 31] = alu_result_vm [(index*32) +: 31]; - end - end - EW64: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [63:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; - end else begin - alu_result_vm [63:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; - alu_result_vm_m [(index*64) +: 63] = alu_result_vm [(index*64) +: 63]; - end - end - endcase + vmsbf_buffer[0] = ~(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism] | found_one_q); + for (int i = 1; i < VmsxfParallelism; i++) begin + vmsbf_buffer[i] = ~((masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism + i]) | ~vmsbf_buffer[i-1]); end + // Have we found a 1 in the current slice? 
+ found_one = |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q; + + alu_result_vmsbf_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = vmsbf_buffer; + alu_result_vmsif_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = {vmsbf_buffer[VmsxfParallelism-2:0], ~found_one_q}; + alu_result_vmsof_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = ~vmsbf_buffer & {vmsbf_buffer[VmsxfParallelism-2:0], ~found_one_q}; + + unique case (vinsn_issue.op) + VMSBF: alu_result_vm = alu_result_vmsbf_vm; + VMSIF: alu_result_vm = alu_result_vmsif_vm; + // VMSOF + default: alu_result_vm = alu_result_vmsof_vm; + endcase + + // Mask the result + alu_result_vm_m = (!vinsn_issue.vm) || (vinsn_issue.op inside {[VMADC:VMSBC]}) ? alu_result_vm | ~masku_operand_m_seq : alu_result_vm; end - VID: begin - if (&masku_operand_alu_valid) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : begin - for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*32); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW16: begin - for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*16); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW32: begin - for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*8); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW64: begin - for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*4); - alu_result_vm_m = alu_result_vm & mask; - end - end - endcase + // VIOTA, VID: compute a slice of the output and mask out the masked elements + // VID re-uses the VIOTA datapath + VIOTA, VID: begin + // Mask the input vector + // VID uses the same datapath of VIOTA, but with implicit input vector at '1 + masku_operand_alu_seq_m = (vinsn_issue.op == VID) + ? 
'1 // VID mask does NOT modify the count
+ : masku_operand_alu_seq
+ & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); // VIOTA mask DOES modify the count
+
+ // Compute output results on `ViotaParallelism 16-bit adders
+ viota_res[0] = viota_acc_q;
+ for (int i = 0; i < ViotaParallelism - 1; i++) begin
+ viota_res[i+1] = viota_res[i] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i];
+ end
+ viota_acc = viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1];
+
+ // This datapath should be relatively simple:
+ // `ViotaParallelism bytes connected, in line, to output byte chunks
+ // Multiple limited-width counters should help the synthesizer reduce wiring
+ unique case (vinsn_issue.vtype.vsew)
+ EW8: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism * 8 + i*8 +: 8] = viota_res[i][7:0];
+ end
+ EW16: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16/ViotaParallelism)-1:0] * ViotaParallelism * 16 + i*16 +: 16] = viota_res[i];
+ end
+ EW32: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32/ViotaParallelism)-1:0] * ViotaParallelism * 32 + i*32 +: 32] = {{32{1'b0}}, viota_res[i]};
+ end
+ default: for (int i = 0; i < ViotaParallelism; i++) begin // EW64
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64/ViotaParallelism)-1:0] * ViotaParallelism * 64 + i*64 +: 64] = {{48{1'b0}}, viota_res[i]};
+ end
+ endcase
+
+ // BE signal for VIOTA, VID
+ unique case (vinsn_issue.vtype.vsew)
+ EW8: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism * 1 + 1*i +: 1] =
+ {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ EW16: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16/ViotaParallelism)-1:0] * ViotaParallelism * 2 + 2*i +: 2] =
+ {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ EW32: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32/ViotaParallelism)-1:0] * ViotaParallelism * 4 + 4*i +: 4] =
+ {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ default: for (int i = 0; i < ViotaParallelism; i++) begin // EW64
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64/ViotaParallelism)-1:0] * ViotaParallelism * 8 + 8*i +: 8] =
+ {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ endcase
+ end
+ // VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit
 [VCPOP:VFIRST] : begin
- vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vs2_seq & bit_enable_mask : masku_operand_vs2_seq;
- end
- default: begin
- alu_result = '0;
- alu_result_vm = '0;
+ vcpop_operand = (!vinsn_issue.vm) ? 
masku_operand_alu_seq & masku_operand_m_seq : masku_operand_alu_seq; end + default:; endcase end - // Shuffle result for masked instructions + // Shuffle the sequential result with vtype.vsew encoding + for (int b = 0; b < (NrLanes*StrbWidth); b++) begin + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + alu_result_vm_shuf[8*shuffle_byte +: 8] = alu_result_vm_m[8*b +: 8]; + end + + // Shuffle the VIOTA, VID byte enable signal + be_viota_shuf = '0; for (int b = 0; b < (NrLanes*StrbWidth); b++) begin - automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - alu_result_vm_seq[8*shuffle_byte +: 8] = alu_result_vm_m[8*b +: 8]; + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + be_viota_shuf[shuffle_byte] = be_viota_seq_d[b]; end - // alu_result propagation mux - if (vinsn_issue.op inside {[VMSBF:VID]}) - alu_result = alu_result_vm_seq; + // Simplify layout handling + alu_result = alu_result_vm_shuf; - end: p_mask_alu + // Prepare the background data with vtype.vsew encoding + result_queue_mask_seq = vinsn_issue.op inside {[VIOTA:VID]} ? '0 : masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}} | {NrLanes*DataWidth{vinsn_issue.op inside {[VMADC:VMSBC]}}}; + background_data_init_seq = masku_operand_vd_seq | result_queue_mask_seq; + background_data_init_shuf = '0; + for (int b = 0; b < (NrLanes*StrbWidth); b++) begin + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + background_data_init_shuf[8*shuffle_byte +: 8] = background_data_init_seq[8*b +: 8]; + end ///////////////// // Mask unit // ///////////////// - // Vector instructions currently running - logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; - - // Interface with the main sequencer - pe_resp_t pe_resp; - - // Effective MASKU stride in case of VSLIDEUP - // MASKU receives chunks of 64 * NrLanes mask bits from the lanes - // VSLIDEUP only needs the bits whose index >= than its stride - // So, the operand requester does not send vl mask bits to MASKU - // and trims all the unused 64 * NrLanes mask bits chunks - // Therefore, the stride needs to be trimmed, too - elen_t trimmed_stride; - - logic [NrLanes-1:0] fake_a_valid; - logic last_incoming_a; - logic unbalanced_a; - - // Control signals for better code-readability (this signals goes high if a result is valid and can be pushed to the result_queue) - logic vreg_wb_valid; - - // Information about which is the target FU of the request - assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; - - // Byte enable for the result queue - logic [NrLanes*ELENB-1:0] result_queue_be_seq; - logic [NrLanes*ELENB-1:0] result_queue_be; - - always_comb begin: p_masku // Maintain state - vinsn_queue_d = vinsn_queue_q; - read_cnt_d = read_cnt_q; - issue_cnt_d = issue_cnt_q; - commit_cnt_d = commit_cnt_q; + vinsn_queue_d = vinsn_queue_q; + read_cnt_d = read_cnt_q; + issue_cnt_d = issue_cnt_q; + processing_cnt_d = processing_cnt_q; + commit_cnt_d = commit_cnt_q; mask_pnt_d = mask_pnt_q; vrf_pnt_d = vrf_pnt_q; - vcpop_slice_cnt_d = vcpop_slice_cnt_q; popcount_d = popcount_q; vfirst_count_d = vfirst_count_q; @@ -634,13 +741,19 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( trimmed_stride = pe_req_i.stride; + out_vrf_word_valid = 1'b0; + out_scalar_valid = 1'b0; + // Vector instructions currently running vinsn_running_d = vinsn_running_q & pe_vinsn_running_i; + // Mask the response, by default + pe_resp = '0; + // We are not ready, by default - pe_resp = '0; - masku_operand_alu_ready = '0; - masku_operand_m_ready = '0; + masku_operand_alu_ready = '0; + masku_operand_m_ready = '0; + masku_operand_vd_ready = '0; // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -649,329 +762,110 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_scalar_d = result_scalar_o; result_scalar_valid_d = result_scalar_valid_o; - // Balance the incoming valid - unbalanced_a = (|commit_cnt_q[idx_width(NrLanes)-1:0] != 1'b0) ? 1'b1 : 1'b0; - last_incoming_a = ((commit_cnt_q - vrf_pnt_q) < NrLanes) ? 1'b1 : 1'b0; - fake_a_valid[0] = 1'b0; - for (int unsigned i = 1; i < NrLanes; i++) - if (i >= {1'b0, commit_cnt_q[idx_width(NrLanes)-1:0]}) - fake_a_valid[i] = last_incoming_a & unbalanced_a; - else - fake_a_valid = 1'b0; + // Don't handshake the inputs + in_ready_cnt_en = 1'b0; + in_m_ready_cnt_en = 1'b0; + out_valid_cnt_en = 1'b0; + + // Result queue background data + for (int unsigned lane = 0; lane < NrLanes; lane++) + result_queue_background_data[lane] = result_queue_q[result_queue_write_pnt_q][lane].wdata; + + // Maintain state + delta_elm_d = delta_elm_q; + in_ready_threshold_d = in_ready_threshold_q; + in_m_ready_threshold_d = in_m_ready_threshold_q; + out_valid_threshold_d = out_valid_threshold_q; + + in_ready_cnt_clr = 1'b0; + in_m_ready_cnt_clr = 1'b0; + out_valid_cnt_clr = 1'b0; + iteration_cnt_clr = 1'b0; ///////////////////// // Mask Operands // ///////////////////// - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin - // Is there place in the mask queue to write the mask operands? - // Did we receive the mask bits on the MaskM channel? - if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid && !(vinsn_issue.op inside {VMSBF, VMSOF, VMSIF})) begin - // Copy data from the mask operands into the mask queue - for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin - // Map vrf_seq_byte to the corresponding byte in the VRF word. - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue.vtype.vsew); - - // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - // NOTE: This does not work if the number of lanes is not a power of two. - // If that is needed, the following two lines must be changed accordingly. 
- automatic int vrf_lane = vrf_byte >> $clog2(StrbWidth);
- automatic int vrf_offset = vrf_byte[idx_width(StrbWidth)-1:0];
-
- // The VRF pointer can be broken into a byte offset, and a bit offset
- automatic int vrf_pnt_byte_offset = mask_pnt_q >> $clog2(StrbWidth);
- automatic int vrf_pnt_bit_offset = mask_pnt_q[idx_width(StrbWidth)-1:0];
-
- // A single bit from the mask operands can be used several times, depending on the eew.
- automatic int mask_seq_bit = vrf_seq_byte >> int'(vinsn_issue.vtype.vsew);
- automatic int mask_seq_byte = (mask_seq_bit >> $clog2(StrbWidth)) + vrf_pnt_byte_offset;
- // Shuffle this source byte
- automatic int mask_byte = shuffle_index(mask_seq_byte, NrLanes, vinsn_issue.eew_vmask);
- // Account for the bit offset
- automatic int mask_bit = (mask_byte << $clog2(StrbWidth)) +
- mask_seq_bit[idx_width(StrbWidth)-1:0] + vrf_pnt_bit_offset;
-
- // At which lane, and what is the bit offset in that lane, of the mask operand from
- // mask_seq_bit?
- automatic int mask_lane = mask_bit >> idx_width(DataWidth);
- automatic int mask_offset = mask_bit[idx_width(DataWidth)-1:0];
-
- // Copy the mask operand
- mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] =
- masku_operand_m[mask_lane][mask_offset];
- end
-
- // Account for the used operands
- mask_pnt_d += NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
-
- // Increment result queue pointers and counters
- mask_queue_cnt_d += 1;
- if (mask_queue_write_pnt_q == MaskQueueDepth-1)
- mask_queue_write_pnt_d = '0;
- else
- mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1;
-
- // Account for the operands that were issued
- read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
- if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)))
- read_cnt_d = '0;
-
- // Trigger the request signal
- mask_queue_valid_d[mask_queue_write_pnt_q] = {NrLanes{1'b1}};
-
- // Are there lanes with no valid elements?
- // If so, mute their request signal
- if (read_cnt_q < NrLanes)
- mask_queue_valid_d[mask_queue_write_pnt_q] = (1 << read_cnt_q) - 1;
-
- // Consumed all valid bytes from the lane operands
- if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin
- // Request another beat
- masku_operand_m_ready = '1;
- // Reset the pointer
- mask_pnt_d = '0;
- end
+ // Instructions that run in other units, but need mask strobes for predicated execution
+
+ // Is there space in the mask queue?
+ if (!mask_queue_full) begin
+ // Copy data from the mask operands into the mask queue
+ for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin
+ // Map vrf_seq_byte to the corresponding byte in the VRF word.
+ automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue.vtype.vsew);
+
+ // At which lane, and what is the byte offset in that lane, of the byte vrf_byte?
+ // NOTE: This does not work if the number of lanes is not a power of two.
+ // If that is needed, the following two lines must be changed accordingly.
+ automatic int vrf_lane = vrf_byte >> $clog2(StrbWidth);
+ automatic int vrf_offset = vrf_byte[idx_width(StrbWidth)-1:0];
+
+ // The VRF pointer can be broken into a byte offset, and a bit offset
+ automatic int vrf_pnt_byte_offset = mask_pnt_q >> $clog2(StrbWidth);
+ automatic int vrf_pnt_bit_offset = mask_pnt_q[idx_width(StrbWidth)-1:0];
+
+ // A single bit from the mask operands can be used several times, depending on the eew. 
+ automatic int mask_seq_bit = vrf_seq_byte >> int'(vinsn_issue.vtype.vsew); + automatic int mask_seq_byte = (mask_seq_bit >> $clog2(StrbWidth)) + vrf_pnt_byte_offset; + // Shuffle this source byte + automatic int mask_byte = shuffle_index(mask_seq_byte, NrLanes, vinsn_issue.eew_vmask); + // Account for the bit offset + automatic int mask_bit = (mask_byte << $clog2(StrbWidth)) + + mask_seq_bit[idx_width(StrbWidth)-1:0] + vrf_pnt_bit_offset; + + // At which lane, and what is the bit offset in that lane, of the mask operand from + // mask_seq_bit? + automatic int mask_lane = mask_bit >> idx_width(DataWidth); + automatic int mask_offset = mask_bit[idx_width(DataWidth)-1:0]; + + // Copy the mask operand + mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] = + masku_operand_m[mask_lane][mask_offset]; end - end - - ////////////////////////////// - // Calculate scalar results // - ////////////////////////////// - - vcpop_vfirst_vs2_ready = 1'b0; - - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && vd_scalar(vinsn_issue.op)) begin - if (&(masku_operand_vs2_seq_valid | fake_a_valid) && (&masku_operand_m_valid || vinsn_issue.vm)) begin - - // increment slice counter - vcpop_slice_cnt_d = vcpop_slice_cnt_q + 1'b1; - - // request new operand (by completing ready-valid handshake) once all slices have been processed - vcpop_vfirst_vs2_ready = 1'b0; - if (((vcpop_slice_cnt_q == N_SLICES_CPOP - 1) && vinsn_issue.op == VCPOP) || - ((vcpop_slice_cnt_q == N_SLICES_VFIRST-1) && vinsn_issue.op == VFIRST)) begin - vcpop_slice_cnt_d = '0; - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin - masku_operand_m_ready = '1; - end - end - // Account for the elements that were processed - issue_cnt_d = issue_cnt_q - W_CPOP; - - // abruptly stop processing elements if vl is reached - if (iteration_count_d >= (vinsn_issue.vl/(W_CPOP)) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin - issue_cnt_d = '0; - commit_cnt_d = '0; - read_cnt_d ='0; - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin - masku_operand_m_ready = '1; - end - end - - popcount_d = popcount_q + popcount; - vfirst_count_d = vfirst_count_q + vfirst_count; - - // if this is the last beat, commit the result to the scalar_result queue - if ((iteration_count_d >= (vinsn_issue.vl/W_CPOP) && vinsn_issue.op == VCPOP) || - (iteration_count_d >= (vinsn_issue.vl/W_VFIRST) && vinsn_issue.op == VFIRST) || - (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin - result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : (vfirst_empty) ? -1 : vfirst_count_d; - result_scalar_valid_d = '1; - - // Decrement the commit counter by the entire number of elements, - // since we only commit one result for everything - commit_cnt_d = '0; - - // reset vcpop slice counter, since instruction is finished - vcpop_slice_cnt_d = '0; - - // acknowledge operand a - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin + // Is there an instruction ready to be issued? + if (vinsn_issue_valid && ((vinsn_issue.vfu != VFU_MaskUnit) || (vinsn_issue.op inside {[VMADC:VMSBC]}))) begin + // Is there place in the mask queue to write the mask operands? + // Did we receive the mask bits on the MaskM channel? 
+ if (!vinsn_issue.vm && &masku_operand_m_valid) begin + // Account for the used operands + mask_pnt_d += NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); + + // Increment result queue pointers and counters + mask_queue_cnt_d += 1; + if (mask_queue_write_pnt_q == MaskQueueDepth-1) + mask_queue_write_pnt_d = '0; + else + mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1; + + // Account for the operands that were issued + read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); + if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew))) + read_cnt_d = '0; + + // Trigger the request signal + mask_queue_valid_d[mask_queue_write_pnt_q] = {NrLanes{1'b1}}; + + // Are there lanes with no valid elements? + // If so, mute their request signal + if (read_cnt_q < NrLanes) + mask_queue_valid_d[mask_queue_write_pnt_q] = (1 << read_cnt_q) - 1; + + // Consumed all valid bytes from the lane operands + if (mask_pnt_d == NrLanes*DataWidth || read_cnt_d == '0) begin + // Request another beat masku_operand_m_ready = '1; + // Reset the pointer + mask_pnt_d = '0; end end end end - ////////////////////////////////// - // Write results to the lanes // - ////////////////////////////////// - - result_queue_be = '1; - result_queue_be_seq = '1; - vmsif_vmsof_vmsbf_vs2_ready = '0; - - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && !vd_scalar(vinsn_issue.op)) begin - // This instruction executes on the Mask Unit - if (vinsn_issue.vfu == VFU_MaskUnit) begin - // Is there place in the result queue to write the results? - // Did we receive the operands? - if (!result_queue_full && (&(masku_operand_alu_valid | fake_a_valid | masku_operand_vs2_seq_valid))) begin - // How many elements are we committing in total? - // Since we are committing bits instead of bytes, we carry out the following calculation - // with ceil(vl/8) instead. - automatic int element_cnt_all_lanes = (ELENB * NrLanes) >> int'(vinsn_issue.vtype.vsew); - // How many elements are remaining to be committed? Carry out the calculation with - // ceil(issue_cnt/8). - automatic int remaining_element_cnt_all_lanes = (issue_cnt_q + 7) / 8; - remaining_element_cnt_all_lanes = (remaining_element_cnt_all_lanes + - (1 << int'(vinsn_issue.vtype.vsew)) - 1) >> int'(vinsn_issue.vtype.vsew); - if (element_cnt_all_lanes > remaining_element_cnt_all_lanes) - element_cnt_all_lanes = remaining_element_cnt_all_lanes; - - // Acknowledge the operands of this instruction. - // At this stage, acknowledge only the first operand, "a", coming from the ALU/VMFpu. - masku_operand_alu_ready = masku_operand_alu_valid; - vmsif_vmsof_vmsbf_vs2_ready = (&masku_operand_m_valid || vinsn_issue.vm) ? 
'1 : '0; - - if (!vinsn_issue.vm) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : result_queue_be_seq = masku_operand_m_seq[NrLanes*ELENB-1:0]; - EW16: begin - for (int i = 0; i < NrLanes * ELENB / 2; i++) begin - result_queue_be_seq[2*i +: 2] = {2{bit_enable_mask[i]}}; - end - end - EW32: begin - for (int i = 0; i < NrLanes * ELENB / 4; i++) begin - result_queue_be_seq[4*i +: 4] = {4{bit_enable_mask[i]}}; - end - end - EW64: begin - for (int i = 0; i < NrLanes * ELENB / 8; i++) begin - result_queue_be_seq[8*i +: 8] = {8{bit_enable_mask[i]}}; - end - end - default: ; // Not sure what should be the default - endcase - for (int i = 0; i < NrLanes*ELENB; i++) begin - result_queue_be[shuffle_index(i, NrLanes, vinsn_issue.vtype.vsew)] = result_queue_be_seq[i]; - end - end - - if (vinsn_issue.op inside {[VMSBF: VMSIF], VID}) begin - result_queue_be = '1; - end - - // Store the result in the operand queue - for (int unsigned lane = 0; lane < NrLanes; lane++) begin - // How many elements are we committing in this lane? - automatic int element_cnt = element_cnt_all_lanes / NrLanes; - if (lane < element_cnt_all_lanes[idx_width(NrLanes)-1:0]) - element_cnt += 1; - - result_queue_d[result_queue_write_pnt_q][lane] = '{ - wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], - be : (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_be[lane*ELENB +: ELENB] : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VIOTA:VID]}) ? vaddr(vinsn_issue.vd, NrLanes, VLEN) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes, VLEN) + - (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), - id : vinsn_issue.id - }; - end - - // Increment the VRF pointer - if (vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]}) begin - vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); - - // Filled-up a word, or finished execution - if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Reset VRF pointer - vrf_pnt_d = '0; - - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if (issue_cnt_q < NrLanes * DataWidth) - issue_cnt_d = '0; - end - end else if (vinsn_issue.op inside {[VMSBF:VID]}) begin - if (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {VIOTA, VID}) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - if (result_queue_read_pnt_q == ResultQueueDepth-1) - result_queue_read_pnt_d = '0; - else - result_queue_read_pnt_d = result_queue_read_pnt_m; - - // Account for the results that were issued - if (vinsn_issue.op inside {VIOTA, VID}) begin - issue_cnt_d = issue_cnt_q - (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); - if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) - issue_cnt_d = '0; - end else begin - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) - issue_cnt_d = '0; - end - end - end 
else begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if (issue_cnt_q < NrLanes * DataWidth) - issue_cnt_d = '0; - end - end - end - end - - /////////////////////////// - //// Masked Instruction /// - /////////////////////////// - if ((|masku_operand_alu_valid && !result_queue_full) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {[VIOTA:VID]}) begin - // if this is the last beat, commit the result to the scalar_result queue - commit_cnt_d = commit_cnt_q - (NrLanes << (int'(EW64) - vinsn_commit.vtype.vsew)); - if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end - end - if ((&masku_operand_alu_valid || &masku_operand_vs2_seq_valid) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {VMSBF, VMSOF, VMSIF}) begin - commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; - if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end - end - - // Finished issuing results - if (vinsn_issue_valid && ( - ( (vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && issue_cnt_d == '0) || - (!(vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && read_cnt_d == '0))) begin - // Increment vector instruction queue pointers and counters - vinsn_queue_d.issue_cnt -= 1; - end - - ///////////////////////////////// - // Send operands to the VFUs // - ///////////////////////////////// + ////////////////////////////////////// + // Send Mask Operands to the VFUs // + ////////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: send_operand mask_valid_o[lane] = mask_queue_valid_q[mask_queue_read_pnt_q][lane]; @@ -980,7 +874,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // The VLDU and the VSTU acknowledge all the operands at once. // Only accept the acknowledgement from the lanes if the current instruction is executing there. // Deactivate the request, but do not bump the pointers for now. - if ((lane_mask_ready_i[lane] && mask_valid_o[lane] && vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu, VFU_MaskUnit}) || + if ((lane_mask_ready_i[lane] && mask_valid_o[lane] && (vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu} || vinsn_issue.op inside {[VMADC:VMSBC]})) || vldu_mask_ready_i || vstu_mask_ready_i || sldu_mask_ready_i) begin mask_queue_valid_d[mask_queue_read_pnt_q][lane] = 1'b0; mask_queue_d[mask_queue_read_pnt_q][lane] = '0; @@ -990,12 +884,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Is this operand going to the lanes? mask_valid_lane_o = vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu, VFU_MaskUnit}; - if (vd_scalar(vinsn_issue.op)) begin - mask_valid_o = (vinsn_issue.vm) ? 
'0 : '1;
- end
-
 // All lanes accepted the VRF request
- if (!(|mask_queue_valid_d[mask_queue_read_pnt_q]))
+ if (!(|mask_queue_valid_d[mask_queue_read_pnt_q])) begin
 // There is something waiting to be written
 if (!mask_queue_empty) begin
 // Increment the read pointer
@@ -1017,10 +907,171 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 commit_cnt_d = '0;
 end
 end
+ end
+
+ ///////////////////////
+ // MASKU ALU Control //
+ ///////////////////////
+
+ // Instructions that natively run in the MASKU
+
+ // The main data packets come from the lanes' ALUs.
+ // Also, mask- and tail-undisturbed policies are implemented by fetching the destination register,
+ // which is the default value of the result queue.
+
+ // Almost all the operations are time multiplexed. Moreover, some operations (e.g., VIOTA) work on
+ // different input and output data widths, meaning that the input ready and the final output valid
+ // are not always synchronized.
+
+ // How many elements {VIOTA|VID} write to each lane
+ elm_per_lane = processing_cnt_q / NrLanes;
+ if ((processing_cnt_q / NrLanes) > 4'b1000)
+ elm_per_lane = 4'b1000;
+ for (int l = 0; l < NrLanes; l++) additional_elm[l] = processing_cnt_q[idx_width(NrLanes)-1:0] > l;
+
+ // Default result queue assignment
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ result_queue_d[result_queue_write_pnt_q][lane] = '{
+ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata, // Retain the last-cycle's data
+ // VIOTA, VID generate a non-mask vector and should comply with undisturbed policy
+ // This means that we can use the byte-enable signal
+ be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) & be_viota_shuf[lane*StrbWidth +: StrbWidth] : '1,
+ addr : vaddr(vinsn_issue.vd, NrLanes, VLEN) + iteration_cnt_q,
+ id : vinsn_issue.id
+ };
+ end
+
+ // Is there an instruction ready to be issued?
+ if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) begin
+ // Compute one slice if we can write and the necessary inputs are valid
+ if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op == VID)
+ && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op)
+ && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+
+ // Initialize the result queue with the background data - either vd or the previous result
+ // For mask vectors, set to 1 (a tail-agnostic safe value) both the background body
+ // elements that will be written by the MASKU ALU and the tail elements.
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ result_queue_background_data[lane] = (out_valid_cnt_q != '0)
+ ? result_queue_q[result_queue_write_pnt_q][lane].wdata
+ : vinsn_issue.op inside {[VIOTA:VID]} ? '1 : background_data_init_shuf[lane*DataWidth +: DataWidth];
+ end
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ // The alu_result has all the bits at 1 except for the portion of bits to write.
+ // The masking is already applied in the MASKU ALU. 
+ result_queue_d[result_queue_write_pnt_q][lane].wdata = result_queue_background_data[lane] & alu_result[lane];
+ end
+ // Write the scalar accumulator
+ popcount_d = popcount_q + popcount;
+ vfirst_count_d = vfirst_count_q + vfirst_count;
+
+ // Bump MASKU ALU state
+ found_one_d = found_one;
+ viota_acc_d = viota_acc;
+ vrf_pnt_d = vrf_pnt_q + delta_elm_q;
+
+ // Increment the input, input-mask, and output slice counters
+ in_ready_cnt_en = 1'b1;
+ in_m_ready_cnt_en = 1'b1;
+ out_valid_cnt_en = 1'b1;
+
+ // Account for the elements that have been processed
+ issue_cnt_d = issue_cnt_q - delta_elm_q;
+ if (issue_cnt_q < delta_elm_q)
+ issue_cnt_d = '0;
+
+ // Request new input (by completing ready-valid handshake) once all slices have been processed
+ // Alu input is accessed in different widths
+ if ((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) begin
+ in_ready_cnt_clr = 1'b1;
+ if (vinsn_issue.op != VID) begin
+ masku_operand_alu_ready = '1;
+ end
+ end
+ // Mask is always accessed at bit level
+ // VMADC, VMSBC handle masks in the mask queue
+ if ((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0) && !(vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+ in_m_ready_cnt_clr = 1'b1;
+ if (!vinsn_issue.vm) begin
+ masku_operand_m_ready = '1;
+ end
+ end
+
+ // Write to the result queue if the entry is full or
+ // if this is the last output slice of the vector.
+ // Also, handshake the vd input, which follows the output.
+ if ((out_valid_cnt_q == out_valid_threshold_q) || (issue_cnt_d == '0)) begin
+ out_valid_cnt_clr = 1'b1;
+ // Handshake vd input
+ if (vinsn_issue.use_vd_op) begin
+ masku_operand_vd_ready = '1;
+ end
+ // Assert valid result queue output
+ out_vrf_word_valid = !vd_scalar(vinsn_issue.op);
+ end
+
+ // The scalar result is valid for write back at the end of the operation.
+ // VFIRST can also interrupt the operation in advance when the 1 is found.
+ if (issue_cnt_d == '0 || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin
+ // Assert valid scalar output
+ out_scalar_valid = vd_scalar(vinsn_issue.op);
+ end
+
+ // Have we finished insn execution? Clear MASKU ALU state
+ if (issue_cnt_d == '0) begin
+ be_viota_seq_d = '1; // Default: write
+ viota_acc_d = '0;
+ found_one_d = '0;
+ end
+ end
+ end
+
+ /////////////////////
+ // Write results //
+ /////////////////////
+
+ // Write VRF words to the result queue
+ if (out_vrf_word_valid) begin
+ // Write to the lanes
+ result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}};
+
+ // Increment result queue pointers and counters
+ result_queue_cnt_d += 1;
+ result_queue_write_pnt_d = result_queue_write_pnt_q + 1;
+ if (result_queue_write_pnt_q == ResultQueueDepth-1) begin
+ result_queue_write_pnt_d = '0;
+ end

- //////////////////////////////////
- // Write results into the VRF //
- //////////////////////////////////
+ // Clear MASKU ALU state
+ be_viota_seq_d = '0;
+
+ // Account for the written results
+ // VIOTA and VID do not write bits!
+ processing_cnt_d = vinsn_issue.op inside {[VIOTA:VID]} ? processing_cnt_q - ((NrLanes * DataWidth / 8) >> vinsn_issue.vtype.vsew) : processing_cnt_q - NrLanes * DataWidth;
+ end
+
+ // The scalar result is ready to be sent to the dispatcher
+ if (out_scalar_valid) begin
+ result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : ((vfirst_empty) ? 
-1 : vfirst_count_d);
+ result_scalar_valid_d = '1;
+
+ // The instruction is over
+ issue_cnt_d = '0;
+ processing_cnt_d = '0;
+ commit_cnt_d = '0;
+ end
+
+ // Finished issuing results
+ if (vinsn_issue_valid && (
+ ( (vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && issue_cnt_d == '0) ||
+ (!(vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && read_cnt_d == '0))) begin
+ // The instruction finished its issue phase
+ vinsn_queue_d.issue_cnt -= 1;
+ end
+
+ //////////////
+ // Commit //
+ //////////////

 for (int lane = 0; lane < NrLanes; lane++) begin: result_write
 masku_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane];
@@ -1044,7 +1095,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 // All lanes accepted the VRF request
 if (!(|result_queue_valid_d[result_queue_read_pnt_q]) &&
- (&result_final_gnt_d || (commit_cnt_q > (NrLanes * DataWidth))))
+ (&result_final_gnt_d || (commit_cnt_q > (NrLanes * DataWidth)))) begin
 // There is something waiting to be written
 if (!result_queue_empty) begin
 // Increment the read pointer
@@ -1060,41 +1111,59 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 result_queue_d[result_queue_read_pnt_q] = '0;

 // Decrement the counter of remaining vector elements waiting to be written
- if (!(vinsn_issue.op inside {VID, VSE})) begin
- commit_cnt_d = commit_cnt_q - NrLanes * DataWidth;
- if (commit_cnt_q < (NrLanes * DataWidth))
- commit_cnt_d = '0;
+ if (!(vinsn_commit.op inside {VSE})) begin
+ if (vinsn_commit.op inside {[VIOTA:VID]}) begin
+ commit_cnt_d = commit_cnt_q - ((NrLanes * DataWidth / 8) >> unsigned'(vinsn_commit.vtype.vsew));
+ if (commit_cnt_q < ((NrLanes * DataWidth / 8) >> unsigned'(vinsn_commit.vtype.vsew)))
+ commit_cnt_d = '0;
+ end else begin
+ commit_cnt_d = commit_cnt_q - NrLanes * DataWidth;
+ if (commit_cnt_q < (NrLanes * DataWidth))
+ commit_cnt_d = '0;
+ end
 end
 end
+ end
+
+ // Finished committing the results of a vector instruction
+ if (vinsn_commit_valid && commit_cnt_d == '0) begin
+ // Clear the output valid counter
+ out_valid_cnt_clr = 1'b1;
+
+ // Clear the vrf pointer for comparisons
+ vrf_pnt_d = '0;
+
+ // Clear the iteration counter
+ iteration_cnt_clr = 1'b1;
+
+ if(&result_final_gnt_d || vd_scalar(vinsn_commit.op) || vinsn_commit.vfu != VFU_MaskUnit) begin
+ // Mark the vector instruction as being done
+ pe_resp.vinsn_done[vinsn_commit.id] = 1'b1;
+
+ // Update the commit counters and pointers
+ vinsn_queue_d.commit_cnt -= 1;
+ end
+ end

 ///////////////////////////
 // Commit scalar results //
 ///////////////////////////

- // The scalar result has been sent to and acknowledged by the dispatcher
- if (vinsn_commit.op inside {[VCPOP:VFIRST]} && result_scalar_valid_o == 1) begin
-
- // reset result_scalar
+ // This is one cycle after asserting out_scalar_valid
+ // Ara's frontend is always ready to accept the scalar result
+ if (result_scalar_valid_o) begin
+ // Reset result_scalar
 result_scalar_d = '0;
 result_scalar_valid_d = '0;

- // reset the popcount and vfirst_count
+ // Clear the iteration counter
+ iteration_cnt_clr = 1'b1;
+
+ // Reset the popcount and vfirst_count
 popcount_d = '0;
 vfirst_count_d = '0;
 end

- // Finished committing the results of a vector instruction
- // Some instructions forward operands to the lanes before writing the VRF
- // In this case, wait for the lanes to be written
- if (vinsn_commit_valid && commit_cnt_d == '0 &&
- (!(vinsn_commit.op inside {[VMFEQ:VID], [VMSGT:VMSBC]}) || &result_final_gnt_d)) begin
- // Mark the vector instruction as being done
- 
pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; - - // Update the commit counters and pointers - vinsn_queue_d.commit_cnt -= 1; - end - ////////////////////////////// // Accept new instruction // ////////////////////////////// @@ -1112,12 +1181,14 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Initialize counters if (vinsn_queue_d.issue_cnt == '0) begin - issue_cnt_d = pe_req_i.vl; - read_cnt_d = pe_req_i.vl; + issue_cnt_d = pe_req_i.vl; + processing_cnt_d = pe_req_i.vl; + read_cnt_d = pe_req_i.vl; // Trim skipped words if (pe_req_i.op == VSLIDEUP) begin - issue_cnt_d -= vlen_t'(trimmed_stride); + issue_cnt_d -= vlen_t'(trimmed_stride); + processing_cnt_d -= vlen_t'(trimmed_stride); case (pe_req_i.vtype.vsew) EW8: begin read_cnt_d -= (vlen_t'(trimmed_stride) >> $clog2(NrLanes << 3)) << $clog2(NrLanes << 3); @@ -1139,9 +1210,68 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( endcase end + // Initialize ALU MASKU counters and pointers + unique case (pe_req_i.op) inside + [VMFEQ:VMSGT]: begin + // Mask to mask - encoded + delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]); + + in_ready_threshold_d = 0; + in_m_ready_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + out_valid_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + end + [VMADC:VMSBC]: begin + // Mask to mask - encoded + delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]); + + in_ready_threshold_d = 0; + in_m_ready_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + out_valid_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + end + [VMANDNOT:VMXNOR]: begin + // Mask to mask + delta_elm_d = VmLogicalParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + out_valid_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + end + [VMSBF:VMSIF]: begin + // Mask to mask + delta_elm_d = VmsxfParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + out_valid_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + end + [VIOTA:VID]: begin + // Mask to non-mask + delta_elm_d = ViotaParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/ViotaParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/ViotaParallelism-1; + out_valid_threshold_d = ((NrLanes*DataWidth/8/ViotaParallelism) >> pe_req_i.vtype.vsew[1:0])-1; + end + VCPOP: begin + // Mask to scalar + delta_elm_d = VcpopParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1; + out_valid_threshold_d = '0; + end + default: begin // VFIRST + // Mask to scalar + delta_elm_d = VfirstParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1; + out_valid_threshold_d = '0; + end + endcase + // Reset the final grant vector // Be aware: this works only if the insn queue length is 1 - result_final_gnt_d = '0; end if (vinsn_queue_d.commit_cnt == '0) begin @@ -1155,33 +1285,47 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; end - end: p_masku + end always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - read_cnt_q <= '0; - issue_cnt_q <= '0; - commit_cnt_q <= '0; - vrf_pnt_q <= '0; - mask_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; - 
vcpop_slice_cnt_q <= '0;
- popcount_q <= '0;
- vfirst_count_q <= '0;
+ vinsn_running_q <= '0;
+ read_cnt_q <= '0;
+ issue_cnt_q <= '0;
+ processing_cnt_q <= '0;
+ commit_cnt_q <= '0;
+ vrf_pnt_q <= '0;
+ mask_pnt_q <= '0;
+ pe_resp_o <= '0;
+ result_final_gnt_q <= '0;
+ popcount_q <= '0;
+ vfirst_count_q <= '0;
+ delta_elm_q <= '0;
+ in_ready_threshold_q <= '0;
+ in_m_ready_threshold_q <= '0;
+ out_valid_threshold_q <= '0;
+ viota_acc_q <= '0;
+ found_one_q <= '0;
+ be_viota_seq_q <= '1; // Default: write
 end else begin
- vinsn_running_q <= vinsn_running_d;
- read_cnt_q <= read_cnt_d;
- issue_cnt_q <= issue_cnt_d;
- commit_cnt_q <= commit_cnt_d;
- vrf_pnt_q <= vrf_pnt_d;
- mask_pnt_q <= mask_pnt_d;
- pe_resp_o <= pe_resp;
- result_final_gnt_q <= result_final_gnt_d;
- vcpop_slice_cnt_q <= vcpop_slice_cnt_d;
- popcount_q <= popcount_d;
- vfirst_count_q <= vfirst_count_d;
+ vinsn_running_q <= vinsn_running_d;
+ read_cnt_q <= read_cnt_d;
+ issue_cnt_q <= issue_cnt_d;
+ processing_cnt_q <= processing_cnt_d;
+ commit_cnt_q <= commit_cnt_d;
+ vrf_pnt_q <= vrf_pnt_d;
+ mask_pnt_q <= mask_pnt_d;
+ pe_resp_o <= pe_resp;
+ result_final_gnt_q <= result_final_gnt_d;
+ popcount_q <= popcount_d;
+ vfirst_count_q <= vfirst_count_d;
+ delta_elm_q <= delta_elm_d;
+ in_ready_threshold_q <= in_ready_threshold_d;
+ in_m_ready_threshold_q <= in_m_ready_threshold_d;
+ out_valid_threshold_q <= out_valid_threshold_d;
+ viota_acc_q <= viota_acc_d;
+ found_one_q <= found_one_d;
+ be_viota_seq_q <= be_viota_seq_d;
 end
 end

diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv
index 503f5b207..2788652c7 100644
--- a/hardware/src/masku/masku_operands.sv
+++ b/hardware/src/masku/masku_operands.sv
@@ -13,13 +13,17 @@
 //
 //
 // Incoming Operands:
-// masku_operands_i = {v0.m, vs2, alu_result, fpu_result}
+// masku_operands_i = {v0.m, vd, alu_result, fpu_result}
 //

 module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
 parameter int unsigned NrLanes = 0,
 parameter type pe_req_t = logic,
- parameter type pe_resp_t = logic
+ parameter type pe_resp_t = logic,
+ // Vl bit mask disabled by default since we fetch vd from opqueues
+ // to provide tail undisturbed policy at bit granularity.
+ // Enable this if the datapath is changed and vd is no longer fetched. 
+ localparam int unsigned VlBitMaskEnable = 0 ) ( input logic clk_i, input logic rst_ni, @@ -41,20 +45,20 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( output logic [NrLanes*ELEN-1:0] masku_operand_alu_seq_o, // ALU/FPU result (deshuffled, uncompressed) output logic [ NrLanes-1:0] masku_operand_alu_seq_valid_o, input logic [ NrLanes-1:0] masku_operand_alu_seq_ready_i, - output elen_t [ NrLanes-1:0] masku_operand_vs2_o, // vs2 (shuffled) - output logic [ NrLanes-1:0] masku_operand_vs2_valid_o, - input logic [ NrLanes-1:0] masku_operand_vs2_ready_i, - output logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq_o, // vs2 (deshuffled) - output logic [ NrLanes-1:0] masku_operand_vs2_seq_valid_o, - input logic [ NrLanes-1:0] masku_operand_vs2_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_vd_o, // vd (shuffled) + output logic [ NrLanes-1:0] masku_operand_vd_valid_o, + input logic [ NrLanes-1:0] masku_operand_vd_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_vd_seq_o, // vd (deshuffled) + output logic [ NrLanes-1:0] masku_operand_vd_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_vd_seq_ready_i, output elen_t [ NrLanes-1:0] masku_operand_m_o, // Mask (shuffled) output logic [ NrLanes-1:0] masku_operand_m_valid_o, input logic [ NrLanes-1:0] masku_operand_m_ready_i, output logic [NrLanes*ELEN-1:0] masku_operand_m_seq_o, // Mask (deshuffled) output logic [ NrLanes-1:0] masku_operand_m_seq_valid_o, input logic [ NrLanes-1:0] masku_operand_m_seq_ready_i, - output logic [NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) - output logic [NrLanes*ELEN-1:0] alu_result_compressed_o // ALU/FPU results compressed (from sew to 1-bit) (shuffled, in mask format) + output logic [NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] alu_result_compressed_seq_o // ALU/FPU results compressed (from sew to 1-bit) (deshuffled, in mask format) ); // Imports @@ -62,75 +66,101 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( // Local Parameter localparam int unsigned DATAPATH_WIDTH = NrLanes * ELEN; // Mask Unit datapath width - localparam int unsigned ELEN_BYTES = ELEN / 8; // Helper signals logic [DATAPATH_WIDTH-1:0] deshuffled_vl_bit_mask; // this bit enable signal is only dependent on vl logic [DATAPATH_WIDTH-1:0] shuffled_vl_bit_mask; // this bit enable signal is only dependent on vl vew_e bit_enable_shuffle_eew; - elen_t [NrLanes-1:0] masku_operand_vs2_d; - logic masku_operand_vs2_lane_valid; - logic masku_operand_vs2_lane_ready; - logic masku_operand_vs2_spill_valid; - logic masku_operand_vs2_spill_ready; + elen_t [NrLanes-1:0] masku_operand_vd_d; + logic [NrLanes-1:0] masku_operand_vd_lane_valid; + logic [NrLanes-1:0] masku_operand_vd_lane_ready; + logic [NrLanes-1:0] masku_operand_vd_spill_valid; + logic [NrLanes-1:0] masku_operand_vd_spill_ready; + elen_t [NrLanes-1:0] masku_operand_m_d; + logic [NrLanes-1:0] masku_operand_m_lane_valid; + logic [NrLanes-1:0] masku_operand_m_lane_ready; + logic [NrLanes-1:0] masku_operand_m_spill_valid; + logic [NrLanes-1:0] masku_operand_m_spill_ready; // Extract operands from input (input comes in "shuffled form" from the lanes) for (genvar lane = 0; lane < NrLanes; lane++) begin - assign masku_operand_m_o[lane] = masku_operands_i[lane][0]; - assign masku_operand_vs2_d[lane] = masku_operands_i[lane][1]; assign masku_operand_alu_o[lane] = masku_operands_i[lane][2 + masku_fu_i]; + 
assign masku_operand_vd_d[lane] = masku_operands_i[lane][1]; + assign masku_operand_m_d[lane] = masku_operands_i[lane][0]; end // ---------- - // Deshuffle vs2 + // Deshuffle input sources // ---------- always_comb begin masku_operand_m_seq_o = '0; - masku_operand_vs2_seq_o = '0; + masku_operand_vd_seq_o = '0; masku_operand_alu_seq_o = '0; - for (int b = 0; b < (NrLanes * ELEN_BYTES); b++) begin - automatic int deshuffle_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + for (int b = 0; b < (NrLanes * ELENB); b++) begin + automatic int deshuffle_alu_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vs2); + automatic int deshuffle_vd_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vd_op); automatic int deshuffle_m_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); - automatic int lane_idx = b / ELEN_BYTES; // rounded down to nearest integer - automatic int lane_offset = b % ELEN_BYTES; - masku_operand_alu_seq_o[8*deshuffle_idx +: 8] = masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; - masku_operand_vs2_seq_o[8*deshuffle_idx +: 8] = masku_operand_vs2_o[lane_idx][8*lane_offset +: 8]; + automatic int lane_idx = b / ELENB; // rounded down to nearest integer + automatic int lane_offset = b % ELENB; + masku_operand_alu_seq_o[8*deshuffle_alu_idx +: 8] = masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; + masku_operand_vd_seq_o[8*deshuffle_vd_idx +: 8] = masku_operand_vd_o[lane_idx][8*lane_offset +: 8]; masku_operand_m_seq_o[8*deshuffle_m_idx +: 8] = masku_operand_m_o[lane_idx][8*lane_offset +: 8]; end end always_comb begin - masku_operand_vs2_spill_ready = 1'b1; + masku_operand_vd_spill_ready = 1'b0; + masku_operand_m_spill_ready = 1'b0; for (int lane = 0; lane < NrLanes; lane++) begin - masku_operand_vs2_spill_ready &= masku_operand_vs2_ready_i[lane] | masku_operand_vs2_seq_ready_i[lane]; + masku_operand_vd_spill_ready[lane] = masku_operand_vd_ready_i[lane] | masku_operand_vd_seq_ready_i[lane]; + masku_operand_m_spill_ready[lane] = masku_operand_m_ready_i[lane] | masku_operand_m_seq_ready_i[lane]; end end - spill_register #( - .T ( elen_t [NrLanes-1:0] ), - .Bypass ( 1'b0 ) - ) i_spill_register_vs2 ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .valid_i (masku_operand_vs2_lane_valid), - .ready_o (masku_operand_vs2_lane_ready), - .data_i (masku_operand_vs2_d), - .valid_o (masku_operand_vs2_spill_valid), - .ready_i (masku_operand_vs2_spill_ready), - .data_o (masku_operand_vs2_o) - ); + for (genvar lane = 0; lane < NrLanes; lane++) begin : gen_masku_operands_spill_regs + spill_register #( + .T ( elen_t ) + ) i_spill_register_vd ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_vd_lane_valid[lane]), + .ready_o (masku_operand_vd_lane_ready[lane]), + .data_i (masku_operand_vd_d[lane]), + .valid_o (masku_operand_vd_spill_valid[lane]), + .ready_i (masku_operand_vd_spill_ready[lane]), + .data_o (masku_operand_vd_o[lane]) + ); + + spill_register #( + .T ( elen_t ) + ) i_spill_register_m ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_m_lane_valid[lane]), + .ready_o (masku_operand_m_lane_ready[lane]), + .data_i (masku_operand_m_d[lane]), + .valid_o (masku_operand_m_spill_valid[lane]), + .ready_i (masku_operand_m_spill_ready[lane]), + .data_o (masku_operand_m_o[lane]) + ); + end for (genvar lane = 0; lane < NrLanes; lane++) begin - assign masku_operand_vs2_valid_o[lane] = masku_operand_vs2_spill_valid; - assign masku_operand_vs2_seq_valid_o[lane] = masku_operand_vs2_spill_valid; + assign masku_operand_vd_valid_o[lane] = 
masku_operand_vd_spill_valid[lane]; + assign masku_operand_vd_seq_valid_o[lane] = masku_operand_vd_spill_valid[lane]; + + assign masku_operand_m_valid_o[lane] = masku_operand_m_spill_valid[lane]; + assign masku_operand_m_seq_valid_o[lane] = masku_operand_m_spill_valid[lane]; end always_comb begin - masku_operand_vs2_lane_valid = 1'b1; + masku_operand_vd_lane_valid = 1'b0; + masku_operand_m_lane_valid = 1'b0; for (int lane = 0; lane < NrLanes; lane++) begin - masku_operand_vs2_lane_valid &= masku_operand_valid_i[lane][1]; + masku_operand_vd_lane_valid[lane] = masku_operand_valid_i[lane][1]; + masku_operand_m_lane_valid[lane] = masku_operand_valid_i[lane][0]; end end @@ -139,7 +169,7 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( // ------------------------------------------------ // Generate shuffled bit level mask - assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} ? vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; + assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSBC]} ? vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; always_comb begin // Default assignments @@ -148,32 +178,39 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( bit_enable_mask_o = '0; // Generate deshuffled vl bit mask - for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin - if (i < vinsn_issue_i.vl) begin - deshuffled_vl_bit_mask[i] = 1'b1; + if (VlBitMaskEnable) begin + for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin + if (i < vinsn_issue_i.vl) begin + deshuffled_vl_bit_mask[i] = 1'b1; + end end end - for (int unsigned b = 0; b < NrLanes * ELEN_BYTES; b++) begin + for (int unsigned b = 0; b < NrLanes * ELENB; b++) begin // local helper signals logic [idx_width(DATAPATH_WIDTH)-1:0] src_operand_byte_shuffle_index; logic [idx_width(DATAPATH_WIDTH)-1:0] mask_operand_byte_shuffle_index; logic [ idx_width(NrLanes)-1:0] mask_operand_byte_shuffle_lane_index; - logic [ idx_width(ELEN_BYTES)-1:0] mask_operand_byte_shuffle_lane_offset; + logic [ idx_width(ELENB)-1:0] mask_operand_byte_shuffle_lane_offset; // get shuffle idices // Note: two types of shuffle indices are needed because the source operand and the // mask register might not have the same effective element width (eew) src_operand_byte_shuffle_index = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); mask_operand_byte_shuffle_index = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); - mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; - mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES)-1:0]; + mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELENB) +: idx_width(NrLanes)]; + mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELENB)-1:0]; // shuffle bit enable - shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + if (VlBitMaskEnable) begin + shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + // Generate bit-level mask + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; + end else begin + shuffled_vl_bit_mask = '0; + bit_enable_mask_o = '0; + end - // Generate bit-level mask - bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; if (!vinsn_issue_i.vm && 
!(vinsn_issue_i.op inside {VMADC, VMSBC})) begin // Exception for VMADC and VMSBC: they use the mask register as a source operand, not as a mask
         bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] &= masku_operand_m_o[mask_operand_byte_shuffle_lane_index][8*mask_operand_byte_shuffle_lane_offset +: 8];
       end
@@ -184,30 +221,24 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
   // Compress ALU/FPU results into a mask vector
   // -------------------------------------------
   always_comb begin
-    alu_result_compressed_o = '0;
-    for (int b = 0; b < ELEN_BYTES * NrLanes; b++) begin
-      if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin
-        automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew);
-        automatic int src_byte_lane = src_byte[idx_width(ELEN_BYTES) +: idx_width(NrLanes)];
-        automatic int src_byte_offset = src_byte[idx_width(ELEN_BYTES)-1:0];
-
-        automatic int dest_bit_seq = (b >> vinsn_issue_i.vtype.vsew) + vrf_pnt_i;
-        automatic int dest_byte_seq = dest_bit_seq / ELEN_BYTES;
-        automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, vinsn_issue_i.vtype.vsew);
-        alu_result_compressed_o[ELEN_BYTES * dest_byte + dest_bit_seq[idx_width(ELEN_BYTES)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset];
+    alu_result_compressed_seq_o = '1;
+    for (int b = 0; b < ELENB * NrLanes; b++) begin
+      if ((b % (1 << vinsn_issue_i.eew_vs2)) == '0) begin
+        automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vs2);
+        automatic int src_byte_lane = src_byte[idx_width(ELENB) +: idx_width(NrLanes)];
+        automatic int src_byte_offset = src_byte[idx_width(ELENB)-1:0];
+
+        automatic int dest_bit_seq = (b >> vinsn_issue_i.eew_vs2) + vrf_pnt_i;
+        automatic int dest_byte_seq = dest_bit_seq / ELENB;
+        alu_result_compressed_seq_o[ELENB * dest_byte_seq + dest_bit_seq[idx_width(ELENB)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset];
       end
     end
   end
-  // Control
   for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands
     // immediately acknowledge operands coming from functional units
     assign masku_operand_alu_valid_o[lane] = masku_operand_valid_i[lane][2 + masku_fu_i];
-
-    assign masku_operand_m_valid_o[lane] = masku_operand_valid_i[lane][0];
-
-    assign masku_operand_m_seq_valid_o[lane] = masku_operand_valid_i[lane][0];
   end: gen_unpack_masku_operands
 
@@ -220,10 +251,10 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
       for (int operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin
         masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_fu_i) && masku_operand_alu_ready_i[lane];
       end
-      // Acknowledge vs2 operands
-      masku_operand_ready_o[lane][1] = masku_operand_vs2_lane_ready;
+      // Acknowledge vd operands
+      masku_operand_ready_o[lane][1] = masku_operand_vd_lane_ready[lane];
       // Acknowledge mask operand
-      masku_operand_ready_o[lane][0] = masku_operand_m_ready_i[lane];
+      masku_operand_ready_o[lane][0] = masku_operand_m_lane_ready[lane];
     end
   end

From 215edf798f31e58376f8dad41109de37441b8a64 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Fri, 15 Nov 2024 17:32:53 +0100
Subject: [PATCH 7/8] [hardware] :bug: Fix legality check in dispatcher

The LMUL-alignment checks on vd, vs2, and vs1 now fire only when the
instruction actually uses the corresponding register (ara_req.use_vd,
use_vs2, use_vs1). This supersedes the ad-hoc skip_vs1_lmul_checks flag
and keeps unused register fields from flagging legal instructions as
illegal. A standalone sketch of the resulting check is appended after
the changelog patch.

---
 hardware/src/ara_dispatcher.sv | 75 +++++++++++++++------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
index 1915c47db..c19efa2a1 100644
--- a/hardware/src/ara_dispatcher.sv
+++ b/hardware/src/ara_dispatcher.sv
@@ -224,7 +224,6 @@ module 
ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
   logic load_zero_vl, store_zero_vl;
   // Do not check vregs validity against current LMUL
   logic skip_lmul_checks;
-  logic skip_vs1_lmul_checks;
   // Are we decoding?
   logic is_decoding;
   // Is this an in-lane operation?
@@ -333,7 +332,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
     store_zero_vl        = 1'b0;
 
     skip_lmul_checks     = 1'b0;
-    skip_vs1_lmul_checks = 1'b0;
 
     null_vslideup = 1'b0;
 
@@ -1522,7 +1520,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           6'b010010: begin // VXUNARY0
             // These instructions do not use vs1
             ara_req.use_vs1       = 1'b0;
-            skip_vs1_lmul_checks  = 1'b1;
             // They are always encoded as ADDs with zero.
             ara_req.op            = ara_pkg::VADD;
             ara_req.use_scalar_op = 1'b1;
@@ -1750,21 +1747,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           // destination register.
           if (!skip_lmul_checks) begin
             unique case (ara_req.emul)
-              LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd;
               default:;
             endcase
             unique case (lmul_vs2)
-              LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2;
               default:;
             endcase
             unique case (lmul_vs1)
-              LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs1;
+              LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs1;
+              LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs1;
               default:;
             endcase
           end
@@ -1992,15 +1989,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           // destination register.
if (!skip_lmul_checks) begin unique case (ara_req.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2; default:; endcase end @@ -2146,7 +2143,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; case (insn.varith_type.rs1) 5'b00000: ara_req.op = VFCVTXUF; @@ -2253,7 +2249,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010011: begin // VFUNARY1 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; unique case (insn.varith_type.rs1) 5'b00000: ara_req.op = ara_pkg::VFSQRT; @@ -2411,28 +2406,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_RSVD: illegal_insn = 1'b1; + default:; + endcase + unique case (lmul_vs1) + LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs1; + LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs1; + LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs1; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase - if (!skip_vs1_lmul_checks) begin - unique case (lmul_vs1) - LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; - LMUL_RSVD: illegal_insn = 1'b1; - default:; - endcase - end end // Ara can support 16-bit float, 32-bit float, 64-bit float. @@ -2705,16 +2698,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin
             unique case (ara_req.emul)
-              LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd;
               LMUL_RSVD: illegal_insn = 1'b1;
               default:;
             endcase
             unique case (lmul_vs2)
-              LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2;
               LMUL_RSVD: illegal_insn = 1'b1;
               default:;
             endcase

From 634a7534047b9e1fcad5a54cba47eb6c7c84d911 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Mon, 25 Nov 2024 14:57:46 +0100
Subject: [PATCH 8/8] [CHANGELOG] Update Changelog

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 919277f9c..36a4bb1fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Bump upload and delete artifact actions
 - Fix synthesis-unfriendly constructs
 - Fix vector slicing bug in operand requesters
+- Fix legality check for allowed registers in dispatcher
+- Remove a couple of latches
 
 ### Added
 
@@ -39,6 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add multi-precision conv3d
 - Add support for unit-stride, non-unit-stride, indexed segment memory instructions
 - Add support for fault-only-first loads
+- Extend the MASKU-related tests in riscv-tests
 
 ### Changed
 
@@ -69,6 +72,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Refactor MASKU
   - Remove bit-support for tail elements
   - Adapt mask tests to this behavior
+- Refactor the MASKU
+  - The MASKU now always receives balanced payloads from the lanes
+  - Remove FPU support from the operand queues that do not need it
 
 ## 3.0.0 - 2023-09-08
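
---

For reference, the dispatcher change in PATCH 7 boils down to one pattern: an
LMUL-alignment violation on vd, vs2, or vs1 may only mark the instruction
illegal if that register is actually used. Below is a standalone SystemVerilog
sketch of that check. It is illustrative only: the lmul_e encoding and the
helper function are assumptions made for this sketch (the real rvv_pkg also
has fractional LMUL encodings), while the 5-bit register index, the alignment
masks, and the use_* gating mirror the diff above.

    // Hypothetical, simplified LMUL encoding; not the rvv_pkg one.
    typedef enum logic [2:0] {LMUL_1, LMUL_2, LMUL_4, LMUL_8, LMUL_RSVD} lmul_e;

    // A register group must start at an index aligned to its LMUL.
    // The check may raise the illegal flag only if the instruction
    // actually reads or writes this register (use_reg): this is the
    // gating PATCH 7 introduces in place of the unconditional
    // illegal_insn = 1'b1 and the ad-hoc skip_vs1_lmul_checks flag.
    function automatic logic lmul_alignment_illegal(
      logic [4:0] vreg,    // vd, vs1, or vs2 index
      lmul_e      lmul,    // effective LMUL of this operand
      logic       use_reg  // does the instruction use this register?
    );
      unique case (lmul)
        LMUL_2:    return ((vreg & 5'b00001) != 5'b00000) && use_reg; // even index
        LMUL_4:    return ((vreg & 5'b00011) != 5'b00000) && use_reg; // multiple of 4
        LMUL_8:    return ((vreg & 5'b00111) != 5'b00000) && use_reg; // multiple of 8
        LMUL_RSVD: return 1'b1;  // reserved encoding: always illegal
        default:   return 1'b0;  // LMUL_1: any index is legal
      endcase
    endfunction

The patch itself keeps the three explicit case blocks; the helper above only
condenses them, e.g. a call like
lmul_alignment_illegal(insn.varith_type.rs1, lmul_vs1, ara_req.use_vs1)
would stand in for the vs1 block.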