diff --git a/Bender.lock b/Bender.lock index 0458a760d..e3514dcd2 100644 --- a/Bender.lock +++ b/Bender.lock @@ -30,10 +30,10 @@ packages: Git: https://github.com/pulp-platform/common_verification.git dependencies: [] cva6: - revision: ee89dcc00e6c1a1f4cf97ee1835e950fcfdeebb5 + revision: ea86d0ac5fe23ac7889cf8a8b8df7a8c0813bfad version: null source: - Git: https://github.com/pulp-platform/cva6.git + Git: https://github.com/mp-17/cva6.git dependencies: - axi - common_cells diff --git a/Bender.yml b/Bender.yml index 142dcd8d0..8bcf21ee4 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,7 +10,7 @@ package: dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.39.1 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.22.1 } - cva6: { git: "https://github.com/pulp-platform/cva6.git", rev: ee89dcc00e6c1a1f4cf97ee1835e950fcfdeebb5 } # pulp-v1 + cva6: { git: "https://github.com/mp-17/cva6.git", rev: ea86d0ac5fe23ac7889cf8a8b8df7a8c0813bfad } # rebase/pulp-v1-os tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13 } apb: { git: "https://github.com/pulp-platform/apb.git", version: 0.2.4 } @@ -42,7 +42,7 @@ sources: - hardware/src/lane/simd_mul.sv - hardware/src/lane/vector_regfile.sv - hardware/src/lane/power_gating_generic.sv - - hardware/src/masku/masku.sv + - hardware/src/masku/masku_operands.sv - hardware/src/sldu/p2_stride_gen.sv - hardware/src/sldu/sldu_op_dp.sv - hardware/src/sldu/sldu.sv @@ -55,6 +55,7 @@ sources: - hardware/src/lane/vmfpu.sv - hardware/src/lane/fixed_p_rounding.sv - hardware/src/vlsu/vlsu.sv + - hardware/src/masku/masku.sv # Level 3 - hardware/src/lane/vector_fus_stage.sv # Level 4 diff --git a/apps/verification/Makefile b/apps/verification/Makefile new file mode 100644 index 000000000..777068a6f --- /dev/null +++ b/apps/verification/Makefile @@ -0,0 +1,46 @@ +# Copyright 2024 ETH Zurich and University of Bologna. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Matteo Perotti, ETH Zurich + +# Variables for sequence length and number of sequences +SEQ_LENGTH ?= 6 +NUM_SEQS ?= 10 + +# Python +PYTHON ?= python3 + +# Directories +SRC_DIR = src +SCRIPT_DIR = script +OUTPUT_DIR = ../rand_seq_autogen + +# Source files +INSTRUCTIONS_FILE = $(SRC_DIR)/vinsn_list.txt +PYTHON_SCRIPT = $(SCRIPT_DIR)/vinsn_trace_gen.py +# Output files +OUTPUT_SEQ_FILE = $(OUTPUT_DIR)/vinsn_rand_seq.S +OUTPUT_MAIN_FILE = $(OUTPUT_DIR)/main.c + +# Target to generate the sequences and main file +all: $(OUTPUT_DIR) $(OUTPUT_SEQ_FILE) $(OUTPUT_MAIN_FILE) + +# Target to create the output directory +$(OUTPUT_DIR): + mkdir -p $(OUTPUT_DIR) + +$(OUTPUT_SEQ_FILE) $(OUTPUT_MAIN_FILE): $(INSTRUCTIONS_FILE) $(PYTHON_SCRIPT) + $(PYTHON) $(PYTHON_SCRIPT) $(INSTRUCTIONS_FILE) $(OUTPUT_SEQ_FILE) $(SEQ_LENGTH) $(NUM_SEQS) $(OUTPUT_MAIN_FILE) + +.PHONY: all clean diff --git a/apps/verification/README.md b/apps/verification/README.md new file mode 100644 index 000000000..54ceb92e5 --- /dev/null +++ b/apps/verification/README.md @@ -0,0 +1,10 @@ +# Usage +To generate the main.c and vinsn_rand_seq.S files with specific sequence length and number of sequences, run: + +```bash +make SEQ_LENGTH=6 NUM_SEQS=10 +``` + +This will create the output directory in the parent directory and place the main.c and vinsn_rand_seq.S files inside it. 
+The SEQ_LENGTH and NUM_SEQS variables can be adjusted as needed when running the make command. +The generated files will include comments at the beginning indicating they were auto-generated by the Python script. \ No newline at end of file diff --git a/apps/verification/script/README.md b/apps/verification/script/README.md new file mode 100644 index 000000000..3b3768cc3 --- /dev/null +++ b/apps/verification/script/README.md @@ -0,0 +1,11 @@ +Execute the script with the following command: + +```bash +python vinsn_trace_gen.py instructions.txt rand_seq.S 6 10 main.c +``` + + - instructions.txt is the input file with the list of instructions. + - rand_seq.S is the output file where the random sequences will be written. + - 6 is the length of each random sequence (including the initial vsetvli instruction). + - 10 is the number of random sequences to generate. + - main.c is the file where the main function and function declarations will be written. \ No newline at end of file diff --git a/apps/verification/script/vinsn_trace_gen.py b/apps/verification/script/vinsn_trace_gen.py new file mode 100644 index 000000000..8a3636480 --- /dev/null +++ b/apps/verification/script/vinsn_trace_gen.py @@ -0,0 +1,92 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Matteo Perotti, ETH Zurich + +import random +import sys + +def load_instructions(file_path): + with open(file_path, 'r') as file: + instructions = [line.strip() for line in file if line.strip() and not line.strip().startswith('#')] + return instructions + +def generate_random_sequences(instructions, sequence_length, num_sequences): + sequences = [] + for _ in range(num_sequences): + XX = random.choice([8, 16, 32, 64]) + Y = random.choice([1, 2, 4, 8]) + initial_instruction = f'vsetvli t0, x0, e{XX}, m{Y}, ta, ma' + sequence = [initial_instruction] + random.choices(instructions, k=sequence_length - 1) + sequences.append(sequence) + return sequences + +def write_sequences_to_file(sequences, output_file_path, script_name): + with open(output_file_path, 'w') as file: + file.write(f'# This file was auto-generated by {script_name}\n') + file.write('.text\n') + for i in range(len(sequences)): + file.write(f'.global rand_seq_{i}\n') + file.write('\n') + + for i, sequence in enumerate(sequences): + file.write(f'rand_seq_{i}:\n') + for instruction in sequence: + file.write(f' {instruction}\n') + file.write(' ret\n\n') + +def write_main_file(num_sequences, output_file_path, script_name): + with open(output_file_path, 'w') as file: + file.write(f'// This file was auto-generated by {script_name}\n') + file.write('#include <stdint.h>\n') + file.write('#include <string.h>\n\n') + file.write('#ifndef SPIKE\n') + file.write('#include "printf.h"\n') + file.write('#else\n') + file.write('#include "util.h"\n') + file.write('#include <stdio.h>\n') + file.write('#endif\n\n') + + for i in range(num_sequences): + file.write(f'void rand_seq_{i}();\n') + file.write('\n') + file.write('int main() {\n') + for i in range(num_sequences): + file.write(f' printf("Rand Seq {i}\\n");\n') + file.write(f' rand_seq_{i}();\n\n') + file.write(' printf("Program end\\n");\n\n') + file.write(' return 0;\n') + file.write('}\n') + +def main(): + if len(sys.argv) != 6: + print("Usage: python generate_sequences.py <instructions_file> <output_seq_file> <sequence_length> <num_sequences> <output_main_file>") + 
sys.exit(1) + + input_file_path = sys.argv[1] + output_seq_file_path = sys.argv[2] + sequence_length = int(sys.argv[3]) + num_sequences = int(sys.argv[4]) + output_main_file_path = sys.argv[5] + + instructions = load_instructions(input_file_path) + sequences = generate_random_sequences(instructions, sequence_length, num_sequences) + script_name = sys.argv[0] + write_sequences_to_file(sequences, output_seq_file_path, script_name) + write_main_file(num_sequences, output_main_file_path, script_name) + print(f'{num_sequences} random sequences of length {sequence_length} have been written to {output_seq_file_path}') + print(f'Main file written to {output_main_file_path}') + +if __name__ == '__main__': + main() diff --git a/apps/verification/src/vinsn_list.txt b/apps/verification/src/vinsn_list.txt new file mode 100644 index 000000000..5f7f304e2 --- /dev/null +++ b/apps/verification/src/vinsn_list.txt @@ -0,0 +1,575 @@ +# Author: Camel Coder, camel-cdr, + +vadd.vv v8,v16,v24 +vadd.vv v8,v16,v24,v0.t +vadd.vx v8,v16,t0 +vadd.vx v8,v16,t0,v0.t +vadd.vi v8,v16,13 +vadd.vi v8,v16,13,v0.t +vsub.vv v8,v16,v24 +vsub.vv v8,v16,v24,v0.t +vsub.vx v8,v16,t0 +vsub.vx v8,v16,t0,v0.t +vrsub.vx v8,v16,t0 +vrsub.vx v8,v16,t0,v0.t +vrsub.vi v8,v16,13 +vrsub.vi v8,v16,13,v0.t +vminu.vv v8,v16,v24 +vminu.vv v8,v16,v24,v0.t +vminu.vx v8,v16,t0 +vminu.vx v8,v16,t0,v0.t +vmin.vv v8,v16,v24 +vmin.vv v8,v16,v24,v0.t +vmin.vx v8,v16,t0 +vmin.vx v8,v16,t0,v0.t +vmaxu.vv v8,v16,v24 +vmaxu.vv v8,v16,v24,v0.t +vmaxu.vx v8,v16,t0 +vmaxu.vx v8,v16,t0,v0.t +vmax.vv v8,v16,v24 +vmax.vv v8,v16,v24,v0.t +vmax.vx v8,v16,t0 +vmax.vx v8,v16,t0,v0.t +vand.vv v8,v16,v24 +vand.vv v8,v16,v24,v0.t +vand.vx v8,v16,t0 +vand.vx v8,v16,t0,v0.t +vand.vi v8,v16,13 +vand.vi v8,v16,13,v0.t +vor.vv v8,v16,v24 +vor.vv v8,v16,v24,v0.t +vor.vx v8,v16,t0 +vor.vx v8,v16,t0,v0.t +vor.vi v8,v16,13 +vor.vi v8,v16,13,v0.t +vxor.vv v8,v16,v24 +vxor.vv v8,v16,v24,v0.t +vxor.vx v8,v16,t0 +vxor.vx v8,v16,t0,v0.t +vxor.vi v8,v16,13 
+vxor.vi v8,v16,13,v0.t +vrgather.vv v8,v16,v24 +vrgather.vv v8,v16,v24,v0.t +vrgather.vx v8,v16,t0 +vrgather.vx v8,v16,t0,v0.t +vrgather.vi v8,v16,3 +vrgather.vi v8,v16,3,v0.t +vslideup.vx v8,v16,t0 +vslideup.vx v8,v16,t0,v0.t +vslideup.vi v8,v16,3 +vslideup.vi v8,v16,3,v0.t +vrgatherei16.vv v8,v16,v24 +vrgatherei16.vv v8,v16,v24,v0.t +vslidedown.vx v8,v16,t0 +vslidedown.vx v8,v16,t0,v0.t +vslidedown.vi v8,v16,3 +vslidedown.vi v8,v16,3,v0.t +vredsum.vs v8,v16,v24 +vredsum.vs v8,v16,v24,v0.t +vredand.vs v8,v16,v24 +vredand.vs v8,v16,v24,v0.t +vredor.vs v8,v16,v24 +vredor.vs v8,v16,v24,v0.t +vredxor.vs v8,v16,v24 +vredxor.vs v8,v16,v24,v0.t +vredminu.vs v8,v16,v24 +vredminu.vs v8,v16,v24,v0.t +vredmin.vs v8,v16,v24 +vredmin.vs v8,v16,v24,v0.t +vredmaxu.vs v8,v16,v24 +vredmaxu.vs v8,v16,v24,v0.t +vredmax.vs v8,v16,v24 +vredmax.vs v8,v16,v24,v0.t +vaaddu.vv v8,v16,v24 +vaaddu.vv v8,v16,v24,v0.t +vaaddu.vx v8,v16,t0 +vaaddu.vx v8,v16,t0,v0.t +vaadd.vv v8,v16,v24 +vaadd.vv v8,v16,v24,v0.t +vaadd.vx v8,v16,t0 +vaadd.vx v8,v16,t0,v0.t +vasubu.vv v8,v16,v24 +vasubu.vv v8,v16,v24,v0.t +vasubu.vx v8,v16,t0 +vasubu.vx v8,v16,t0,v0.t +vasub.vv v8,v16,v24 +vasub.vv v8,v16,v24,v0.t +vasub.vx v8,v16,t0 +vasub.vx v8,v16,t0,v0.t +vslide1up.vx v8,v16,t0 +vslide1up.vx v8,v16,t0,v0.t +vslide1down.vx v8,v16,t0 +vslide1down.vx v8,v16,t0,v0.t +vadc.vvm v8,v16,v24,v0 +vadc.vxm v8,v16,t0,v0 +vadc.vim v8,v16,13,v0 +vmadc.vvm v8,v16,v24,v0 +vmadc.vxm v8,v16,t0,v0 +vmadc.vim v8,v16,13,v0 +vsbc.vvm v8,v16,v24,v0 +vsbc.vxm v8,v16,t0,v0 +vmsbc.vvm v8,v16,v24,v0 +vmsbc.vxm v8,v16,t0,v0 +vmerge.vvm v8,v16,v24,v0 +vmerge.vxm v8,v16,t0,v0 +vmerge.vim v8,v16,13,v0 +vmv.v.v v8,v16 +vmv.v.x v8,t0 +vmv.v.i v8,13 +vmseq.vv v8,v16,v24 +vmseq.vv v8,v16,v24,v0.t +vmseq.vx v8,v16,t0 +vmseq.vx v8,v16,t0,v0.t +vmseq.vi v8,v16,13 +vmseq.vi v8,v16,13,v0.t +vmsne.vv v8,v16,v24 +vmsne.vv v8,v16,v24,v0.t +vmsne.vx v8,v16,t0 +vmsne.vx v8,v16,t0,v0.t +vmsne.vi v8,v16,13 +vmsne.vi v8,v16,13,v0.t +vmsltu.vv v8,v16,v24 
+vmsltu.vv v8,v16,v24,v0.t +vmsltu.vx v8,v16,t0 +vmsltu.vx v8,v16,t0,v0.t +vmslt.vv v8,v16,v24 +vmslt.vv v8,v16,v24,v0.t +vmslt.vx v8,v16,t0 +vmslt.vx v8,v16,t0,v0.t +vmsleu.vv v8,v16,v24 +vmsleu.vv v8,v16,v24,v0.t +vmsleu.vx v8,v16,t0 +vmsleu.vx v8,v16,t0,v0.t +vmsleu.vi v8,v16,13 +vmsleu.vi v8,v16,13,v0.t +vmsle.vv v8,v16,v24 +vmsle.vv v8,v16,v24,v0.t +vmsle.vx v8,v16,t0 +vmsle.vx v8,v16,t0,v0.t +vmsle.vi v8,v16,13 +vmsle.vi v8,v16,13,v0.t +vmsgtu.vx v8,v16,t0 +vmsgtu.vx v8,v16,t0,v0.t +vmsgtu.vi v8,v16,13 +vmsgtu.vi v8,v16,13,v0.t +vmsgt.vx v8,v16,t0 +vmsgt.vx v8,v16,t0,v0.t +vmsgt.vi v8,v16,13 +vmsgt.vi v8,v16,13,v0.t +vcompress.vm v0,v8,v16 +vmandn.mm v0,v8,v16 +vmand.mm v0,v8,v16 +vmor.mm v0,v8,v16 +vmxor.mm v0,v8,v16 +vmorn.mm v0,v8,v16 +vmnand.mm v0,v8,v16 +vmnor.mm v0,v8,v16 +vmxnor.mm v0,v8,v16 +vsaddu.vv v8,v16,v24 +vsaddu.vv v8,v16,v24,v0.t +vsaddu.vx v8,v16,t0 +vsaddu.vx v8,v16,t0,v0.t +vsaddu.vi v8,v16,13 +vsaddu.vi v8,v16,13,v0.t +vsadd.vv v8,v16,v24 +vsadd.vv v8,v16,v24,v0.t +vsadd.vx v8,v16,t0 +vsadd.vx v8,v16,t0,v0.t +vsadd.vi v8,v16,13 +vsadd.vi v8,v16,13,v0.t +vssubu.vv v8,v16,v24 +vssubu.vv v8,v16,v24,v0.t +vssubu.vx v8,v16,t0 +vssubu.vx v8,v16,t0,v0.t +vssub.vv v8,v16,v24 +vssub.vv v8,v16,v24,v0.t +vssub.vx v8,v16,t0 +vssub.vx v8,v16,t0,v0.t +vsll.vv v8,v16,v24 +vsll.vv v8,v16,v24,v0.t +vsll.vx v8,v16,t0 +vsll.vx v8,v16,t0,v0.t +vsll.vi v8,v16,13 +vsll.vi v8,v16,13,v0.t +vsmul.vv v8,v16,v24 +vsmul.vv v8,v16,v24,v0.t +vsmul.vx v8,v16,t0 +vsmul.vx v8,v16,t0,v0.t +vmv1r.v v8,v16 +vmv2r.v v8,v16 +vmv4r.v v8,v16 +vmv8r.v v8,v16 +vsrl.vv v8,v16,v24 +vsrl.vv v8,v16,v24,v0.t +vsrl.vx v8,v16,t0 +vsrl.vx v8,v16,t0,v0.t +vsrl.vi v8,v16,13 +vsrl.vi v8,v16,13,v0.t +vsra.vv v8,v16,v24 +vsra.vv v8,v16,v24,v0.t +vsra.vx v8,v16,t0 +vsra.vx v8,v16,t0,v0.t +vsra.vi v8,v16,13 +vsra.vi v8,v16,13,v0.t +vssrl.vv v8,v16,v24 +vssrl.vv v8,v16,v24,v0.t +vssrl.vx v8,v16,t0 +vssrl.vx v8,v16,t0,v0.t +vssrl.vi v8,v16,13 +vssrl.vi v8,v16,13,v0.t +vdivu.vv v8,v16,v24 
+vdivu.vv v8,v16,v24,v0.t +vdivu.vx v8,v16,t0 +vdivu.vx v8,v16,t0,v0.t +vdiv.vv v8,v16,v24 +vdiv.vv v8,v16,v24,v0.t +vdiv.vx v8,v16,t0 +vdiv.vx v8,v16,t0,v0.t +vremu.vv v8,v16,v24 +vremu.vv v8,v16,v24,v0.t +vremu.vx v8,v16,t0 +vremu.vx v8,v16,t0,v0.t +vrem.vv v8,v16,v24 +vrem.vv v8,v16,v24,v0.t +vrem.vx v8,v16,t0 +vrem.vx v8,v16,t0,v0.t +vmulhu.vv v8,v16,v24 +vmulhu.vv v8,v16,v24,v0.t +vmulhu.vx v8,v16,t0 +vmulhu.vx v8,v16,t0,v0.t +vmul.vv v8,v16,v24 +vmul.vv v8,v16,v24,v0.t +vmul.vx v8,v16,t0 +vmul.vx v8,v16,t0,v0.t +vmulhsu.vv v8,v16,v24 +vmulhsu.vv v8,v16,v24,v0.t +vmulhsu.vx v8,v16,t0 +vmulhsu.vx v8,v16,t0,v0.t +vmulh.vv v8,v16,v24 +vmulh.vv v8,v16,v24,v0.t +vmulh.vx v8,v16,t0 +vmulh.vx v8,v16,t0,v0.t +vmadd.vv v8,v16,v24 +vmadd.vv v8,v16,v24,v0.t +vmadd.vx v8,t0,v16 +vmadd.vx v8,t0,v16,v0.t +vmacc.vv v8,v16,v24 +vmacc.vv v8,v16,v24,v0.t +vmacc.vx v8,t0,v16 +vmacc.vx v8,t0,v16,v0.t +vnsrl.wv v8,v16,v24 +vnsrl.wv v8,v16,v24,v0.t +vnsrl.wx v8,v16,t0 +vnsrl.wx v8,v16,t0,v0.t +vnsrl.wi v8,v16,13 +vnsrl.wi v8,v16,13,v0.t +vnsra.wv v8,v16,v24 +vnsra.wv v8,v16,v24,v0.t +vnsra.wx v8,v16,t0 +vnsra.wx v8,v16,t0,v0.t +vnsra.wi v8,v16,13 +vnsra.wi v8,v16,13,v0.t +vnclipu.wv v8,v16,v24 +vnclipu.wv v8,v16,v24,v0.t +vnclipu.wx v8,v16,t0 +vnclipu.wx v8,v16,t0,v0.t +vnclipu.wi v8,v16,13 +vnclipu.wi v8,v16,13,v0.t +vnclip.wv v8,v16,v24 +vnclip.wv v8,v16,v24,v0.t +vnclip.wx v8,v16,t0 +vnclip.wx v8,v16,t0,v0.t +vnclip.wi v8,v16,13 +vnclip.wi v8,v16,13,v0.t +vnmsub.vv v8,v16,v24 +vnmsub.vv v8,v16,v24,v0.t +vnmsub.vx v8,t0,v16 +vnmsub.vx v8,t0,v16,v0.t +vnmsac.vv v8,v16,v24 +vnmsac.vv v8,v16,v24,v0.t +vnmsac.vx v8,t0,v16 +vnmsac.vx v8,t0,v16,v0.t +vwaddu.vv v8,v16,v24 +vwaddu.vv v8,v16,v24,v0.t +vwaddu.vx v8,v16,t0 +vwaddu.vx v8,v16,t0,v0.t +vwadd.vv v8,v16,v24 +vwadd.vv v8,v16,v24,v0.t +vwadd.vx v8,v16,t0 +vwadd.vx v8,v16,t0,v0.t +vwsub.vv v8,v16,v24 +vwsub.vv v8,v16,v24,v0.t +vwsub.vx v8,v16,t0 +vwsub.vx v8,v16,t0,v0.t +vwaddu.wv v8,v16,v24 +vwaddu.wv v8,v16,v24,v0.t +vwaddu.wx 
v8,v16,t0 +vwaddu.wx v8,v16,t0,v0.t +vwadd.wv v8,v16,v24 +vwadd.wv v8,v16,v24,v0.t +vwadd.wx v8,v16,t0 +vwadd.wx v8,v16,t0,v0.t +vwsub.wv v8,v16,v24 +vwsub.wv v8,v16,v24,v0.t +vwsub.wx v8,v16,t0 +vwsub.wx v8,v16,t0,v0.t +vwmulu.vv v8,v16,v24 +vwmulu.vv v8,v16,v24,v0.t +vwmulu.vx v8,v16,t0 +vwmulu.vx v8,v16,t0,v0.t +vwmulsu.vv v8,v16,v24 +vwmulsu.vv v8,v16,v24,v0.t +vwmulsu.vx v8,v16,t0 +vwmulsu.vx v8,v16,t0,v0.t +vwmul.vv v8,v16,v24 +vwmul.vv v8,v16,v24,v0.t +vwmul.vx v8,v16,t0 +vwmul.vx v8,v16,t0,v0.t +vwmaccu.vv v8,v16,v24 +vwmaccu.vv v8,v16,v24,v0.t +vwmaccu.vx v8,t0,v16 +vwmaccu.vx v8,t0,v16,v0.t +vwmacc.vv v8,v16,v24 +vwmacc.vv v8,v16,v24,v0.t +vwmacc.vx v8,t0,v16 +vwmacc.vx v8,t0,v16,v0.t +vwmaccsu.vv v8,v16,v24 +vwmaccsu.vv v8,v16,v24,v0.t +vwmaccsu.vx v8,t0,v16 +vwmaccsu.vx v8,t0,v16,v0.t +vwmaccus.vx v8,t0,v16 +vwmaccus.vx v8,t0,v16,v0.t +vfadd.vv v8,v16,v24 +vfadd.vv v8,v16,v24,v0.t +vfadd.vf v8,v16,ft0 +vfadd.vf v8,v16,ft0,v0.t +vfsub.vv v8,v16,v24 +vfsub.vv v8,v16,v24,v0.t +vfsub.vf v8,v16,ft0 +vfsub.vf v8,v16,ft0,v0.t +vfmin.vv v8,v16,v24 +vfmin.vv v8,v16,v24,v0.t +vfmin.vf v8,v16,ft0 +vfmin.vf v8,v16,ft0,v0.t +vfmax.vv v8,v16,v24 +vfmax.vv v8,v16,v24,v0.t +vfmax.vf v8,v16,ft0 +vfmax.vf v8,v16,ft0,v0.t +vfsgnj.vv v8,v16,v24 +vfsgnj.vv v8,v16,v24,v0.t +vfsgnj.vf v8,v16,ft0 +vfsgnj.vf v8,v16,ft0,v0.t +vfsgnjn.vv v8,v16,v24 +vfsgnjn.vv v8,v16,v24,v0.t +vfsgnjn.vf v8,v16,ft0 +vfsgnjn.vf v8,v16,ft0,v0.t +vfsgnjx.vv v8,v16,v24 +vfsgnjx.vv v8,v16,v24,v0.t +vfsgnjx.vf v8,v16,ft0 +vfsgnjx.vf v8,v16,ft0,v0.t +vfslide1up.vf v8,v16,ft0 +vfslide1up.vf v8,v16,ft0,v0.t +vfslide1down.vf v8,v16,ft0 +vfslide1down.vf v8,v16,ft0,v0.t +vfredusum.vs v8,v16,v24 +vfredusum.vs v8,v16,v24,v0.t +vfredosum.vs v8,v16,v24 +vfredosum.vs v8,v16,v24,v0.t +vfredmin.vs v8,v16,v24 +vfredmin.vs v8,v16,v24,v0.t +vfredmax.vs v8,v16,v24 +vfredmax.vs v8,v16,v24,v0.t +vfmerge.vfm v8,v16,ft0,v0 +vfmv.v.f v8,ft0 +vmfeq.vv v8,v16,v24 +vmfeq.vv v8,v16,v24,v0.t +vmfeq.vf v8,v16,ft0 +vmfeq.vf 
v8,v16,ft0,v0.t +vmfle.vv v8,v16,v24 +vmfle.vv v8,v16,v24,v0.t +vmfle.vf v8,v16,ft0 +vmfle.vf v8,v16,ft0,v0.t +vmflt.vv v8,v16,v24 +vmflt.vv v8,v16,v24,v0.t +vmflt.vf v8,v16,ft0 +vmflt.vf v8,v16,ft0,v0.t +vmfne.vv v8,v16,v24 +vmfne.vv v8,v16,v24,v0.t +vmfne.vf v8,v16,ft0 +vmfne.vf v8,v16,ft0,v0.t +vmfgt.vv v8,v16,v24 +vmfgt.vv v8,v16,v24,v0.t +vmfgt.vf v8,v16,ft0 +vmfgt.vf v8,v16,ft0,v0.t +vmfge.vv v8,v16,v24 +vmfge.vv v8,v16,v24,v0.t +vmfge.vf v8,v16,ft0 +vmfge.vf v8,v16,ft0,v0.t +vfdiv.vv v8,v16,v24 +vfdiv.vv v8,v16,v24,v0.t +vfdiv.vf v8,v16,ft0 +vfdiv.vf v8,v16,ft0,v0.t +vfrdiv.vf v8,v16,ft0 +vfrdiv.vf v8,v16,ft0,v0.t +vfmul.vv v8,v16,v24 +vfmul.vv v8,v16,v24,v0.t +vfmul.vf v8,v16,ft0 +vfmul.vf v8,v16,ft0,v0.t +vfrsub.vf v8,v16,ft0 +vfrsub.vf v8,v16,ft0,v0.t +vfmadd.vv v8,v16,v24 +vfmadd.vv v8,v16,v24,v0.t +vfmadd.vf v8,ft0,v16 +vfmadd.vf v8,ft0,v16,v0.t +vfmsub.vv v8,v16,v24 +vfmsub.vv v8,v16,v24,v0.t +vfmsub.vf v8,ft0,v16 +vfmsub.vf v8,ft0,v16,v0.t +vfmacc.vv v8,v16,v24 +vfmacc.vv v8,v16,v24,v0.t +vfmacc.vf v8,ft0,v16 +vfmacc.vf v8,ft0,v16,v0.t +vfmsac.vv v8,v16,v24 +vfmsac.vv v8,v16,v24,v0.t +vfmsac.vf v8,ft0,v16 +vfmsac.vf v8,ft0,v16,v0.t +vfnmsac.vv v8,v16,v24 +vfnmsac.vv v8,v16,v24,v0.t +vfnmsac.vf v8,ft0,v16 +vfnmsac.vf v8,ft0,v16,v0.t +vfnmacc.vv v8,v16,v24 +vfnmacc.vv v8,v16,v24,v0.t +vfnmacc.vf v8,ft0,v16 +vfnmacc.vf v8,ft0,v16,v0.t +vfnmsub.vv v8,v16,v24 +vfnmsub.vv v8,v16,v24,v0.t +vfnmsub.vf v8,ft0,v16 +vfnmsub.vf v8,ft0,v16,v0.t +vfnmadd.vv v8,v16,v24 +vfnmadd.vv v8,v16,v24,v0.t +vfnmadd.vf v8,ft0,v16 +vfnmadd.vf v8,ft0,v16,v0.t +vwredsumu.vs v8,v16,v24 +vwredsumu.vs v8,v16,v24,v0.t +vwredsum.vs v8,v16,v24 +vwredsum.vs v8,v16,v24,v0.t +vfwadd.vv v8,v16,v24 +vfwadd.vv v8,v16,v24,v0.t +vfwadd.vf v8,v16,ft0 +vfwadd.vf v8,v16,ft0,v0.t +vfwsub.vv v8,v16,v24 +vfwsub.vv v8,v16,v24,v0.t +vfwsub.vf v8,v16,ft0 +vfwsub.vf v8,v16,ft0,v0.t +vfwadd.wv v8,v16,v24 +vfwadd.wv v8,v16,v24,v0.t +vfwadd.wf v8,v16,ft0 +vfwadd.wf v8,v16,ft0,v0.t +vfwsub.wv v8,v16,v24 
+vfwsub.wv v8,v16,v24,v0.t +vfwsub.wf v8,v16,ft0 +vfwsub.wf v8,v16,ft0,v0.t +vfwmul.vv v8,v16,v24 +vfwmul.vv v8,v16,v24,v0.t +vfwmul.vf v8,v16,ft0 +vfwmul.vf v8,v16,ft0,v0.t +vfwmacc.vv v8,v16,v24 +vfwmacc.vv v8,v16,v24,v0.t +vfwmacc.vf v8,ft0,v16 +vfwmacc.vf v8,ft0,v16,v0.t +vfwnmacc.vv v8,v16,v24 +vfwnmacc.vv v8,v16,v24,v0.t +vfwnmacc.vf v8,ft0,v16 +vfwnmacc.vf v8,ft0,v16,v0.t +vfwmsac.vv v8,v16,v24 +vfwmsac.vv v8,v16,v24,v0.t +vfwmsac.vf v8,ft0,v16 +vfwmsac.vf v8,ft0,v16,v0.t +vfwnmsac.vv v8,v16,v24 +vfwnmsac.vv v8,v16,v24,v0.t +vfwnmsac.vf v8,ft0,v16 +vfwnmsac.vf v8,ft0,v16,v0.t +vfwredosum.vs v8,v16,v24 +vfwredosum.vs v8,v16,v24,v0.t +vfwredusum.vs v8,v16,v24 +vfwredusum.vs v8,v16,v24,v0.t +vmv.s.x v8,t0 +vmv.x.s t0,v8 +vcpop.m t0,v8 +vcpop.m t0,v8,v0.t +vfirst.m t0,v8 +vfirst.m t0,v8,v0.t +vzext.vf2 v8,v16 +vzext.vf2 v8,v16,v0.t +vsext.vf2 v8,v16 +vsext.vf2 v8,v16,v0.t +vzext.vf4 v8,v16 +vzext.vf4 v8,v16,v0.t +vsext.vf4 v8,v16 +vsext.vf4 v8,v16,v0.t +vzext.vf8 v8,v16 +vzext.vf8 v8,v16,v0.t +vsext.vf8 v8,v16 +vsext.vf8 v8,v16,v0.t +vfmv.f.s ft0,v8 +vfmv.s.f v8,ft0 +vfcvt.xu.f.v v8,v16 +vfcvt.xu.f.v v8,v16,v0.t +vfcvt.x.f.v v8,v16 +vfcvt.x.f.v v8,v16,v0.t +vfcvt.f.xu.v v8,v16 +vfcvt.f.xu.v v8,v16,v0.t +vfcvt.f.x.v v8,v16 +vfcvt.f.x.v v8,v16,v0.t +vfcvt.rtz.x.f.v v8,v16 +vfcvt.rtz.x.f.v v8,v16,v0.t +vfcvt.rtz.xu.f.v v8,v16 +vfcvt.rtz.xu.f.v v8,v16,v0.t +vfwcvt.xu.f.v v8,v16 +vfwcvt.xu.f.v v8,v16,v0.t +vfwcvt.x.f.v v8,v16 +vfwcvt.x.f.v v8,v16,v0.t +vfwcvt.f.xu.v v8,v16 +vfwcvt.f.xu.v v8,v16,v0.t +vfwcvt.f.x.v v8,v16 +vfwcvt.f.x.v v8,v16,v0.t +vfwcvt.f.f.v v8,v16 +vfwcvt.f.f.v v8,v16,v0.t +vfwcvt.rtz.xu.f.v v8,v16 +vfwcvt.rtz.xu.f.v v8,v16,v0.t +vfwcvt.rtz.x.f.v v8,v16 +vfwcvt.rtz.x.f.v v8,v16,v0.t +vfncvt.xu.f.w v8,v16 +vfncvt.xu.f.w v8,v16,v0.t +vfncvt.x.f.w v8,v16 +vfncvt.x.f.w v8,v16,v0.t +vfncvt.f.xu.w v8,v16 +vfncvt.f.xu.w v8,v16,v0.t +vfncvt.f.x.w v8,v16 +vfncvt.f.x.w v8,v16,v0.t +vfncvt.f.f.w v8,v16 +vfncvt.f.f.w v8,v16,v0.t +vfncvt.rtz.x.f.w v8,v16 
+vfncvt.rtz.x.f.w v8,v16,v0.t +vfncvt.rtz.xu.f.w v8,v16 +vfncvt.rtz.xu.f.w v8,v16,v0.t +vfncvt.rod.f.f.w v8,v16 +vfncvt.rod.f.f.w v8,v16,v0.t +vfsqrt.v v8,v16 +vfsqrt.v v8,v16,v0.t +vfrsqrt7.v v8,v16 +vfrsqrt7.v v8,v16,v0.t +vfrec7.v v8,v16 +vfrec7.v v8,v16,v0.t +vfclass.v v8,v16 +vfclass.v v8,v16,v0.t +vmsbf.m v8,v16 +vmsbf.m v8,v16,v0.t +vmsof.m v8,v16 +vmsof.m v8,v16,v0.t +vmsif.m v8,v16 +vmsif.m v8,v16,v0.t +viota.m v8,v16 +viota.m v8,v16,v0.t +vid.v v8 +vid.v v8,v0.t diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 8d51aa576..d1dd3788f 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -324,11 +324,11 @@ package ara_pkg; // Scalar response elen_t resp; - // Instruction triggered an error - logic error; + // Instruction triggered an exception + ariane_pkg::exception_t exception; // New value for vstart - vlen_t error_vl; + vlen_t exception_vstart; } ara_resp_t; //////////////////// diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 9b1b74463..8e12543c0 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -123,8 +123,8 @@ module ara import ara_pkg::*; #( pe_resp_t [NrPEs-1:0] pe_resp; // Interface with the address generator logic addrgen_ack; - logic addrgen_error; - vlen_t addrgen_error_vl; + ariane_pkg::exception_t addrgen_exception; + vlen_t addrgen_exception_vstart; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -171,8 +171,8 @@ module ara import ara_pkg::*; #( .pe_scalar_resp_ready_o(pe_scalar_resp_ready ), // Interface with the address generator .addrgen_ack_i (addrgen_ack ), - .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_exception_i (addrgen_exception ), + .addrgen_exception_vstart_i(addrgen_exception_vstart ) ); // Scalar move support @@ -337,8 +337,8 @@ module ara import ara_pkg::*; #( .pe_req_ready_o (pe_req_ready[NrLanes+OffsetStore : NrLanes+OffsetLoad]), .pe_resp_o 
(pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ), .addrgen_ack_o (addrgen_ack ), - .addrgen_error_o (addrgen_error ), - .addrgen_error_vl_o (addrgen_error_vl ), + .addrgen_exception_o (addrgen_exception ), + .addrgen_exception_vstart_o (addrgen_exception_vstart ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -458,6 +458,9 @@ module ara import ara_pkg::*; #( if (ara_pkg::VLEN == 0) $error("[ara] The vector length must be greater than zero."); + if (ara_pkg::VLENB < 8 * NrLanes) + $error("[ara] Every vector register with LMUL1 should have at least 8 Byte/lane."); + if (ara_pkg::VLEN < ELEN) $error( "[ara] The vector length must be greater or equal than the maximum size of a single vector element" diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 998e84230..b06afd097 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -53,17 +53,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // CSRs // //////////// - vlen_t vstart_d, vstart_q; - vlen_t vl_d, vl_q; - vtype_t vtype_d, vtype_q; - vxsat_e vxsat_d, vxsat_q; - vxrm_t vxrm_d, vxrm_q; - - `FF(vstart_q, vstart_d, '0) - `FF(vl_q, vl_d, '0) - `FF(vtype_q, vtype_d, '{vill: 1'b1, default: '0}) - `FF(vxsat_q, vxsat_d, '0) - `FF(vxrm_q, vxrm_d, '0) + vlen_t csr_vstart_d, csr_vstart_q; + vlen_t csr_vl_d, csr_vl_q; + vtype_t csr_vtype_d, csr_vtype_q; + vxsat_e csr_vxsat_d, csr_vxsat_q; + vxrm_t csr_vxrm_d, csr_vxrm_q; + + `FF(csr_vstart_q, csr_vstart_d, '0) + `FF(csr_vl_q, csr_vl_d, '0) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vxsat_q, csr_vxsat_d, '0) + `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. 
function automatic riscv::xlen_t xlen_vtype(vtype_t vtype); xlen_vtype = {vtype.vill, {riscv::XLEN-9{1'b0}}, vtype.vma, vtype.vta, vtype.vsew, @@ -133,14 +133,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( typedef enum logic [1:0] { NORMAL_OPERATION, WAIT_IDLE, - RESHUFFLE, - SLDU_SEQUENCER + RESHUFFLE } state_e; state_e state_d, state_q; // We need to memorize the element width used to store each vector on the lanes, so that we are // able to deshuffle it when needed. rvv_pkg::vew_e [31:0] eew_d, eew_q; + // eew buffers for reshuffling + rvv_pkg::vew_e reshuffle_eew_vs1_d, reshuffle_eew_vs1_q; + rvv_pkg::vew_e reshuffle_eew_vs2_d, reshuffle_eew_vs2_q; + rvv_pkg::vew_e reshuffle_eew_vd_d, reshuffle_eew_vd_q; // If the reg was not written, the content is unknown. No need to reshuffle // when writing with != EEW logic [31:0] eew_valid_d, eew_valid_q; @@ -167,6 +170,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_q <= '0; rs_lmul_cnt_limit_q <= '0; rs_mask_request_q <= 1'b0; + reshuffle_eew_vs1_q <= rvv_pkg::EW8; + reshuffle_eew_vs2_q <= rvv_pkg::EW8; + reshuffle_eew_vd_q <= rvv_pkg::EW8; end else begin state_q <= state_d; eew_q <= eew_d; @@ -178,6 +184,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_q <= rs_lmul_cnt_d; rs_lmul_cnt_limit_q <= rs_lmul_cnt_limit_d; rs_mask_request_q <= rs_mask_request_d; + reshuffle_eew_vs1_q <= reshuffle_eew_vs1_d; + reshuffle_eew_vs2_q <= reshuffle_eew_vs2_d; + reshuffle_eew_vd_q <= reshuffle_eew_vd_d; end end @@ -193,26 +202,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // its counters of pending memory operations // Ara should tell Ariane when a memory operation is completed, so that it can modify // its pending load/store counters. - // A memory operation can be completed both when it is over and when vl_q == 0. In the latter case, + // A memory operation can be completed both when it is over and when csr_vl_q == 0. 
In the latter case, // Ara's decoder answers immediately, and this can cause a collision with an answer from Ara's VLSU. - // To avoid collisions, we give precedence to the VLSU, and we delay the vl_q == 0 memory op + // To avoid collisions, we give precedence to the VLSU, and we delay the csr_vl_q == 0 memory op // completion signal if a collision occurs logic load_zero_vl, store_zero_vl; // Do not checks vregs validity against current LMUL logic skip_lmul_checks; - logic skip_vs1_lmul_checks; // Are we decoding? logic is_decoding; // Is this an in-lane operation? logic in_lane_op; - // If the vslideup offset is greater than vl_q, the vslideup has no effects + // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects logic null_vslideup; // Pipeline the VLSU's load and store complete signals, for timing reasons logic load_complete_q; logic store_complete_q; - `FF(load_complete_q, load_complete_i, 1'b0) - `FF(store_complete_q, store_complete_i, 1'b0) + logic illegal_insn_load, illegal_insn_store; + `FF(load_complete_q, load_complete_i || illegal_insn_load, 1'b0) + `FF(store_complete_q, store_complete_i || illegal_insn_store, 1'b0) // NP2 Slide support logic is_stride_np2; @@ -237,27 +246,32 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_decoder // Default values - vstart_d = vstart_q; - vl_d = vl_q; - vtype_d = vtype_q; + csr_vstart_d = csr_vstart_q; + csr_vl_d = csr_vl_q; + csr_vtype_d = csr_vtype_q; state_d = state_q; eew_d = eew_q; eew_valid_d = eew_valid_q; - lmul_vs2 = vtype_q.vlmul; - lmul_vs1 = vtype_q.vlmul; + lmul_vs2 = csr_vtype_q.vlmul; + lmul_vs1 = csr_vtype_q.vlmul; - reshuffle_req_d = reshuffle_req_q; - eew_old_buffer_d = eew_old_buffer_q; - eew_new_buffer_d = eew_new_buffer_q; - vs_buffer_d = vs_buffer_q; + reshuffle_req_d = reshuffle_req_q; + eew_old_buffer_d = eew_old_buffer_q; + eew_new_buffer_d = eew_new_buffer_q; + vs_buffer_d = vs_buffer_q; + reshuffle_eew_vs1_d = 
reshuffle_eew_vs1_q; + reshuffle_eew_vs2_d = reshuffle_eew_vs2_q; + reshuffle_eew_vd_d = reshuffle_eew_vd_q; rs_lmul_cnt_d = '0; rs_lmul_cnt_limit_d = '0; rs_mask_request_d = 1'b0; illegal_insn = 1'b0; - vxsat_d = vxsat_q; - vxrm_d = vxrm_q; + illegal_insn_load = 1'b0; + illegal_insn_store = 1'b0; + csr_vxsat_d = csr_vxsat_q; + csr_vxrm_d = csr_vxrm_q; is_vload = 1'b0; is_vstore = 1'b0; @@ -265,15 +279,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( store_zero_vl = 1'b0; skip_lmul_checks = 1'b0; - skip_vs1_lmul_checks = 1'b0; null_vslideup = 1'b0; - is_decoding = 1'b0; - in_lane_op = 1'b0; + is_decoding = 1'b0; + in_lane_op = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; acc_resp_o = '{ trans_id : acc_req_i.trans_id, load_complete : load_zero_vl | load_complete_q, @@ -282,18 +293,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( fflags_valid : |fflags_ex_valid_i, default : '0 }; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // fflags for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; ara_req_d = '{ - vl : vl_q, - vstart : vstart_q, - vtype : vtype_q, - emul : vtype_q.vlmul, - eew_vs1 : vtype_q.vsew, - eew_vs2 : vtype_q.vsew, - eew_vd_op : vtype_q.vsew, + vl : csr_vl_q, + vstart : csr_vstart_q, + vtype : csr_vtype_q, + emul : csr_vtype_q.vlmul, + eew_vs1 : csr_vtype_q.vsew, + eew_vs2 : csr_vtype_q.vsew, + eew_vd_op : csr_vtype_q.vsew, eew_vmask : eew_q[VMASK], cvt_resize : CVT_SAME, default : '0 @@ -304,9 +317,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b0; // Saturation in any lane will raise vxsat flag - vxsat_d |= |vxsat_flag_i; + csr_vxsat_d |= |vxsat_flag_i; // Fixed-point rounding mode is applied to all lanes - for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = vxrm_q; + for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = csr_vxrm_q; // Rounding mode is shared between all lanes for 
(int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; // Special states @@ -330,6 +343,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_limit_d = rs_lmul_cnt_limit_q; rs_mask_request_d = 1'b0; + // Every single reshuffle request refers to LMUL == 1 + ara_req_d.emul = LMUL_1; + + // vstart is always 0 for a reshuffle + ara_req_d.vstart = '0; + // These generate a reshuffle request to Ara's backend // When LMUL > 1, not all the regs that compose a large // register should always be reshuffled @@ -369,26 +388,35 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. + // If we are here, vd has been already reshuffled. unique casez (reshuffle_req_d) - 3'b??1: begin - eew_old_buffer_d = eew_q[insn.vmem_type.rd]; - eew_new_buffer_d = ara_req_d.vtype.vsew; - vs_buffer_d = insn.varith_type.rd; - end 3'b?10: begin eew_old_buffer_d = eew_q[insn.vmem_type.rs2]; - eew_new_buffer_d = ara_req_d.eew_vs2; + eew_new_buffer_d = reshuffle_eew_vs2_q; vs_buffer_d = insn.varith_type.rs2; end 3'b100: begin eew_old_buffer_d = eew_q[insn.vmem_type.rs1]; - eew_new_buffer_d = ara_req_d.eew_vs1; + eew_new_buffer_d = reshuffle_eew_vs1_q; vs_buffer_d = insn.varith_type.rs1; end default:; endcase - if (reshuffle_req_d == 3'b0) state_d = NORMAL_OPERATION; + if (reshuffle_req_d == 3'b0) begin + // If LMUL_X has X > 1, Ara can inject different reshuffle ops during RESHUFFLE, + // one per LMUL_1-register that needs to be reshuffled. In mixed cases, we have + // multiple instructions that reshuffle parts of the original LMUL_X-register + // (e.g., LMUL_8, vd = v0, eew = 64, and only v1 and v5 have eew = 64). In this + // case, the dependency of the next LMUL_8 instruction on v0 should be on all + // the reshuffle micro operations. 
This is not possible with the current architecture. + // Therefore, we either set the dependency on the very last instruction only, or + // we just wait until the reshuffle is over. + // The best optimization would be injecting contiguous reshuffles with X > 1 and + // an extended vl. If we injected only one reshuffle, we can skip the wait idle. + if (csr_vtype_q.vlmul != LMUL_1) state_d = WAIT_IDLE; + else state_d = NORMAL_OPERATION; + end // The register is not completely reshuffled (LMUL > 1) end else begin // Count up @@ -401,17 +429,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 3'b??1: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.vtype.vsew; + eew_new_buffer_d = reshuffle_eew_vd_q; end 3'b?10: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.eew_vs2; + eew_new_buffer_d = reshuffle_eew_vs2_q; end 3'b100: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.eew_vs1; + eew_new_buffer_d = reshuffle_eew_vs1_q; end default:; endcase @@ -428,7 +456,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Decoding is_decoding = 1'b1; // Acknowledge the request - acc_resp_o.req_ready = ara_req_ready_i; + acc_resp_o.req_ready = 1'b1; // Decode the instructions based on their opcode unique case (acc_req_i.insn.itype.opcode) @@ -440,7 +468,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); - // These always respond at the same cycle + // These (mostly) always respond at the same cycle acc_resp_o.resp_valid = 1'b1; // Decode based on their func3 field @@ -448,33 +476,33 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Configuration instructions OPCFG: begin: opcfg // These can be acknowledged regardless of the 
state of Ara - acc_resp_o.req_ready = 1'b1; + // NOTE: unless there is a pending fault-only first vector load is_config = 1'b1; // Update vtype if (insn.vsetvli_type.func1 == 1'b0) begin // vsetvli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); end else if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); end else if (insn.vsetvl_type.func7 == 7'b100_0000) begin // vsetvl - vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); end else - acc_resp_o.error = 1'b1; + illegal_insn = 1'b1; // Check whether the updated vtype makes sense - if ((vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN - (vtype_d.vlmul == LMUL_RSVD) || // reserved value + if ((csr_vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN + (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN - (signed'($clog2(ELENB)) + signed'(vtype_d.vlmul) < signed'(vtype_d.vsew))) begin - vtype_d = '{vill: 1'b1, default: '0}; - vl_d = '0; + (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin + csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vl_d = '0; end // Update the vector length else begin // Maximum vector length. VLMAX = LMUL * VLEN / SEW. 
- automatic int unsigned vlmax = VLENB >> vtype_d.vsew; - unique case (vtype_d.vlmul) + automatic int unsigned vlmax = VLENB >> csr_vtype_d.vsew; + unique case (csr_vtype_d.vlmul) LMUL_1 : vlmax <<= 0; LMUL_2 : vlmax <<= 1; LMUL_4 : vlmax <<= 2; @@ -487,24 +515,24 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vl_d = vlen_t'(insn.vsetivli_type.uimm5); + csr_vl_d = vlen_t'(insn.vsetivli_type.uimm5); end else begin // vsetvl || vsetvli if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd == '0) begin // Do not update the vector length - vl_d = vl_q; + csr_vl_d = csr_vl_q; end else if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd != '0) begin // Set the vector length to vlmax - vl_d = vlmax; + csr_vl_d = vlmax; end else begin // Normal stripmining - vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(vl_d)]) || + csr_vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(csr_vl_d)]) || (vlen_t'(acc_req_i.rs1) > vlmax)) ? vlmax : vlen_t'(acc_req_i.rs1); end end end // Return the new vl - acc_resp_o.result = vl_d; + acc_resp_o.result = csr_vl_d; // If the vtype has changed, wait for the backend before issuing any new instructions. // This is to avoid hazards on implicit register labels when LMUL_old > LMUL_new @@ -512,7 +540,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Checking only lmul_q is a trick: we want to stall only if both lmuls have // zero MSB. 
If lmul_q has zero MSB, it's greater than lmul_d only if also // lmul_d has zero MSB since the slice comparison is intrinsically unsigned - if (!vtype_q.vlmul[2] && (vtype_d.vlmul[2:0] < vtype_q.vlmul[2:0])) + if (!csr_vtype_q.vlmul[2] && (csr_vtype_d.vlmul[2:0] < csr_vtype_q.vlmul[2:0])) state_d = WAIT_IDLE; end @@ -549,26 +577,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if (((insn.varith_type.rs1 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) || - ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001))) - illegal_insn = 1'b1; - LMUL_4: - if (((insn.varith_type.rs1 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) || - ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011))) - illegal_insn = 1'b1; - LMUL_8: - if (((insn.varith_type.rs1 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) || - ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111))) - illegal_insn = 1'b1; - default: - if ((insn.varith_type.rs1 == insn.varith_type.rd) || - (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; - endcase + if ((insn.varith_type.rs1 == insn.varith_type.rd) || + (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; end 6'b010010: begin ara_req_d.op = ara_pkg::VSBC; @@ -579,50 +591,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if (((insn.varith_type.rs1 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) || - ((insn.varith_type.rs2 & 5'b00001) == ( insn.varith_type.rd & 5'b00001))) - illegal_insn = 1'b1; - LMUL_4: - if (((insn.varith_type.rs1 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) || - ((insn.varith_type.rs2 & 
5'b00011) == (insn.varith_type.rd & 5'b00011))) - illegal_insn = 1'b1; - LMUL_8: - if (((insn.varith_type.rs1 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) || - ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111))) - illegal_insn = 1'b1; - default: - if ((insn.varith_type.rs1 == insn.varith_type.rd) || - (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; - endcase + if ((insn.varith_type.rs1 == insn.varith_type.rd) || + (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -632,7 +622,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.varith_type.vm) begin ara_req_d.eew_vs1 = eew_q[ara_req_d.vs1]; ara_req_d.vtype.vsew = eew_q[ara_req_d.vs1]; - ara_req_d.vl = (vl_q << vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; + ara_req_d.vl = (csr_vl_q << csr_vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; end end 6'b100000: ara_req_d.op = ara_pkg::VSADDU; @@ -648,11 +638,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -666,11 +656,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -683,28 +673,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end // Reductions encode in cvt_resize the neutral value bits // CVT_WIDE is 2'b00 (hack to save wires) 6'b110000: begin ara_req_d.op = ara_pkg::VWREDSUMU; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = 
csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -728,7 +718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVX: begin: opivx @@ -758,7 +748,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Vl refers to current system vsew, but operand requesters @@ -766,13 +756,13 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // i.e., request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -789,21 +779,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 
5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b010010: begin ara_req_d.op = ara_pkg::VSBC; @@ -816,53 +794,33 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -881,11 +839,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = 
OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -899,11 +857,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -916,11 +874,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -938,7 +896,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVI: begin: opivi @@ -966,19 +924,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode 
vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -995,45 +953,27 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - 
ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -1091,11 +1031,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1109,11 +1049,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1126,11 +1066,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -1148,7 +1088,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVV: begin: opmvv @@ -1237,7 +1177,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Sign extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW8: begin ara_req_d.conversion_vs2 = OpQueueConversionSExt8; end @@ -1251,12 +1191,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + if ( ara_resp_valid_i ) begin + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; end end 6'b010100: begin @@ -1283,7 +1223,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMAND; @@ -1291,7 +1230,6 @@ module 
ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMOR; @@ -1299,7 +1237,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMXOR; @@ -1307,7 +1244,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMORNOT; @@ -1315,7 +1251,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMNAND; @@ -1323,7 +1258,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMNOR; @@ -1331,7 +1265,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMXNOR; @@ -1339,12 +1272,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b010010: begin // VXUNARY0 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; // They are always encoded as ADDs with zero. 
ara_req_d.op = ara_pkg::VADD; ara_req_d.use_scalar_op = 1'b1; @@ -1357,8 +1288,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00011: begin // VSEXT.VF8 @@ -1367,44 +1298,44 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00100: begin // VZEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionZExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00101: begin // VSEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionSExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00110: begin // VZEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = 
CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end 5'b00111: begin // VSEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end default: illegal_insn = 1'b1; @@ -1444,92 +1375,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - 
ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1537,31 +1468,31 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - 
ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1572,21 +1503,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; default:; endcase unique case (lmul_vs1) - LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs1; default:; endcase end @@ -1595,7 +1526,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if 
(csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVX: begin: opmvx @@ -1620,17 +1551,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -1638,7 +1569,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.x ara_req_d.op = ara_pkg::VMVSX; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 
1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -1676,92 +1607,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = 
next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + 
ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1769,41 +1700,41 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = 
csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1814,15 +1745,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; default:; endcase end @@ -1831,7 +1762,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPFVV: begin: opfvv @@ -1901,7 +1832,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Zero-extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin ara_req_d.conversion_vs2 = OpQueueConversionZExt4; end @@ -1912,7 +1843,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // NaN-box the result if needed - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin vfmvfs_result[63:16] = '1; vfmvfs_result[15:0] = 
ara_resp_i.resp[15:0]; @@ -1926,11 +1857,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Wait until the back-end answers to acknowledge those instructions if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = vfmvfs_result; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = vfmvfs_result; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; end end 6'b011000: ara_req_d.op = ara_pkg::VMFEQ; @@ -1940,7 +1871,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; case (insn.varith_type.rs1) 5'b00000: ara_req_d.op = VFCVTXUF; @@ -1952,103 +1882,101 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01011: begin // Widening VFCVTFX 
ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b10000: begin // Narrowing VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10001: begin // Narrowing VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10010: begin // Narrowing VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10011: begin // Narrowing VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_NARROW; - 
ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10100: begin // Narrowing VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10101: begin // Narrowing VFNCVTRODFF ara_req_d.op = VFNCVTRODFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10110: begin // Narrowing VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10111: begin // Narrowing VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: begin // Trigger an error - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase end 6'b010011: begin // VFUNARY1 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; unique case (insn.varith_type.rs1) 5'b00000: ara_req_d.op = ara_pkg::VFSQRT; @@ -2104,99 +2032,99 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + 
ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op 
= ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -2206,28 +2134,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_RSVD: illegal_insn = 1'b1; + default:; + endcase + unique case (lmul_vs1) + LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_4 : if 
((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs1; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase - if (!skip_vs1_lmul_checks) begin - unique case (lmul_vs1) - LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; - LMUL_RSVD: illegal_insn = 1'b1; - default:; - endcase - end end // Ara can support 16-bit float, 32-bit float, 64-bit float. @@ -2252,7 +2178,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end @@ -2291,17 +2217,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vfslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -2309,7 +2235,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.f ara_req_d.op = ara_pkg::VFMVSF; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 
1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -2370,85 +2296,85 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; 
- ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase // Check if the FP scalar operand is NaN-boxed. If not, replace it with a NaN. - case (vtype_q.vsew) + case (csr_vtype_q.vsew) EW16: if (~(&acc_req_i.rs1[63:16])) ara_req_d.scalar_op = 64'h0000000000007e00; EW32: if (~(&acc_req_i.rs1[63:32])) ara_req_d.scalar_op = 64'h000000007fc00000; endcase @@ -2458,16 +2384,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase @@ -2491,7 +2417,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) 
illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end endcase @@ -2525,7 +2451,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2533,7 +2459,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2541,7 +2467,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2549,15 +2475,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; + ara_req_valid_d = 1'b0; end endcase @@ -2572,19 +2498,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask load, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end 5'b10000: begin // Unit-strided, fault-only first // TODO: Not implemented - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end endcase end @@ -2604,24 +2526,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_load = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_load = 1'b1; end end 
default:; @@ -2631,20 +2551,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default:; endcase @@ -2654,9 +2570,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself - illegal_insn = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; + illegal_insn_load = 1'b0; ara_req_valid_d = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. 
@@ -2680,20 +2594,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_load = 1'b1; end endcase end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // In case of error, modify vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin + csr_vstart_d = ara_resp_i.exception_vstart; + end end end @@ -2738,7 +2653,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; // ara_req_d.vtype.vsew is the target EEW! 
end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2746,7 +2661,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2754,7 +2669,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2762,15 +2677,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
- acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase @@ -2785,13 +2697,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask store, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end endcase end @@ -2811,24 +2721,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_store = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_store = 1'b1; end end default:; @@ -2838,20 +2746,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. 
unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end default:; endcase @@ -2861,6 +2765,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (ara_req_d.op == VSE && insn.vmem_type.rs2 == 5'b01000) begin // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; + illegal_insn_store = 1'b0; // Maximum vector length. VLMAX = nf * VLEN / EW8. ara_req_d.vtype.vsew = EW8; @@ -2883,25 +2788,25 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_store = 1'b1; end endcase - illegal_insn = 1'b0; acc_resp_o.req_ready = 1'b0; acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b1; end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // If there is an error, change vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin + csr_vstart_d = ara_resp_i.exception_vstart; + end end end @@ -2910,184 +2815,240 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( //////////////////////////// riscv::OpcodeSystem: begin - // These always respond at the 
same cycle - acc_resp_o.resp_valid = 1'b1; - is_config = 1'b1; - - unique case (acc_req_i.insn.itype.funct3) - 3'b001: begin // csrrw - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = acc_req_i.rs1; - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b010: begin // csrrs - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b011: begin // csrrc - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b101: begin // csrrwi - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = acc_req_i.insn.itype.rs1[15]; - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b110: begin // csrrsi - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b111: begin // csrrci - // Decode the CSR. - unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - default: begin - // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - end - endcase + // CSR ops have semantic dependency from vector instrucitons. 
+ // Therefore, Ara must be idle before performing any CSR operation. + + // Stall if there is any pending vector instruction + // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending. + // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise. + // E.g., CSR vlenb is a design-constant parameter, reading is always safe. + // E.g., CSRs vxrm and vxsat have no influence on-non fixed-point instructions, it could be read and written safely when no fixed-point operation is running. + // By better analyzing the spec, more of optimizations of such can be made. For the sake of simplicity, the current implementation treats CSR ops as one block. + if ( ara_idle_i ) begin + // These always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; + is_config = 1'b1; + + unique case (acc_req_i.insn.itype.funct3) + 3'b001: begin // csrrw + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = acc_req_i.rs1; + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = vxrm_t'( acc_req_i.rs1[17:16] ); + csr_vxsat_d = vxsat_e'( acc_req_i.rs1[15] ); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b010: begin // csrrs + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[17:16]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b011: begin // csrrc + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b101: begin // csrrwi + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = acc_req_i.rs1[0]; + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b110: begin // csrrsi + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b111: begin // csrrci + // Decode the CSR. 
+ unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q & ~vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn= 1'b1; + endcase + end + default: begin + // Trigger an illegal instruction + illegal_insn = 1'b1; + end + endcase // acc_req_i.insn.itype.funct3 + end + else begin + acc_resp_o.req_ready = 1'b0; + end end default: begin // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn = 1'b1; end endcase end @@ -3101,9 +3062,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) illegal_insn = 1'b1; + // Raise an illegal instruction exception + if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin + ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception.valid = 1'b1; + 
acc_resp_o.exception.cause = riscv::ILLEGAL_INSTR; + acc_resp_o.exception.tval = acc_req_i.insn; + end + // Check if we need to reshuffle our vector registers involved in the operation // This operation is costly when occurs, so avoid it if possible - if (ara_req_valid_d && !acc_resp_o.error) begin + if ( ara_req_valid_d && !acc_resp_o.exception.valid ) begin automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Is the instruction an in-lane one and could it be subject to reshuffling? @@ -3114,7 +3085,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Optimization: reshuffle vd only if we are not overwriting the whole vector register! reshuffle_req_d = {ara_req_d.use_vs1 && (ara_req_d.eew_vs1 != eew_q[ara_req_d.vs1]) && eew_valid_q[ara_req_d.vs1] && in_lane_op, ara_req_d.use_vs2 && (ara_req_d.eew_vs2 != eew_q[ara_req_d.vs2]) && eew_valid_q[ara_req_d.vs2] && in_lane_op, - ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && vl_q != (VLENB >> ara_req_d.vtype.vsew)}; + ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && csr_vl_q != ((VLENB << ara_req_d.emul[1:0]) >> ara_req_d.vtype.vsew)}; + // Mask out requests if they refer to the same register! + reshuffle_req_d &= { + (insn.varith_type.rs1 != insn.varith_type.rs2) && (insn.varith_type.rs1 != insn.varith_type.rd), + (insn.varith_type.rs2 != insn.varith_type.rd), + 1'b1}; // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. 
@@ -3156,17 +3132,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( default: rs_lmul_cnt_limit_d = 0; endcase + // Save info for next reshuffles + reshuffle_eew_vs1_d = ara_req_d.eew_vs1; + reshuffle_eew_vs2_d = ara_req_d.eew_vs2; + reshuffle_eew_vd_d = ara_req_d.vtype.vsew; + // Reshuffle state_d = RESHUFFLE; end end - // Raise an illegal instruction exception - if (illegal_insn) begin - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; - end - // Update the EEW if (ara_req_valid_d && ara_req_d.use_vd && ara_req_ready_i) begin unique case (ara_req_d.emul) @@ -3205,8 +3180,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Any valid non-config instruction is a NOP if vl == 0, with some exceptions, // e.g. whole vector memory operations / whole vector register move - if (is_decoding && (vl_q == '0 || null_vslideup) && !is_config && - !ignore_zero_vl_check && !acc_resp_o.error) begin + if (is_decoding && (csr_vstart_q >= csr_vl_q || null_vslideup) && !is_config && + !ignore_zero_vl_check && !acc_resp_o.exception.valid) begin // If we are acknowledging a memory operation, we must tell Ariane that the memory // operation was resolved (to decrement its pending load/store counter) // This can collide with the same signal from the vector load/store unit, so we must @@ -3218,6 +3193,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( store_zero_vl = is_vstore; end + // Reset vstart to zero for successful vector instructions + // Corner cases: + // * vstart exception reporting, e.g., VLSU, is handled above + // * CSR operations are not considered vector instructions + if ( acc_resp_o.resp_valid + & !acc_resp_o.exception.valid + & (acc_req_i.insn.itype.opcode != riscv::OpcodeSystem) + ) begin + csr_vstart_d = '0; + end + acc_resp_o.load_complete = load_zero_vl | load_complete_q; acc_resp_o.store_complete = store_zero_vl | store_complete_q; diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 
5fb0abff1..74fce4573 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -40,8 +40,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i output logic pe_scalar_resp_ready_o, // Interface with the Address Generation input logic addrgen_ack_i, - input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input ariane_pkg::exception_t addrgen_exception_i, + input vlen_t addrgen_exception_vstart_i ); /////////////////////////////////// @@ -438,8 +438,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i state_d = IDLE; ara_req_ready_o = 1'b1; ara_resp_valid_o = 1'b1; - ara_resp_o.error = addrgen_error_i; - ara_resp_o.error_vl = addrgen_error_vl_i; + ara_resp_o.exception = addrgen_exception_i; + ara_resp_o.exception_vstart = addrgen_exception_vstart_i; end // Wait for the scalar result diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..d93b79fda 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -652,7 +652,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes != pe_req.vl) operand_request_i[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); operand_request_i[AluB] = '{ id : pe_req.id, @@ -679,7 +679,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes != pe_req.vl) operand_request_i[AluB].vl += 1; end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, 
VMSBF, VFIRST}); operand_request_i[MulFPUA] = '{ id : pe_req.id, @@ -695,7 +695,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request_i[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); operand_request_i[MulFPUB] = '{ id : pe_req.id, @@ -710,24 +710,26 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request_i[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); operand_request_i[MaskB] = '{ id : pe_req.id, - vs : pe_req.vd, - eew : pe_req.eew_vd_op, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. vl : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vd, + hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? 
pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request_i[MaskB].vl += 1; - operand_request_push[MaskB] = pe_req.use_vd_op; + operand_request_i[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); + if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin + operand_request_i[MaskB].vl += 1'b1; + end + operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; operand_request_i[MaskM] = '{ id : pe_req.id, diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 9bfc120f4..f9c447d62 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -795,7 +795,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; ////////////////////////////// if (!vinsn_queue_full && vfu_operation_valid_i && - (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + (vfu_operation_i.vfu == VFU_Alu || (vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && + !(vfu_operation_i.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST})))) begin vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index da6fb002a..855f89440 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -50,6 +50,20 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( import cf_math_pkg::idx_width; + // Pointers + // + // We need a pointer to which bit on the full VRF word we are reading mask operands from. + logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; + // We need a pointer to which bit on the full VRF word we are writing results to. 
+ logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; + + // Remaining elements of the current instruction in the read operand phase + vlen_t read_cnt_d, read_cnt_q; + // Remaining elements of the current instruction in the issue phase + vlen_t issue_cnt_d, issue_cnt_q; + // Remaining elements of the current instruction in the commit phase + vlen_t commit_cnt_d, commit_cnt_q; + //////////////// // Operands // //////////////// @@ -57,39 +71,130 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Information about which is the target FU of the request masku_fu_e masku_operand_fu; - // ALU/FPU result - elen_t [NrLanes-1:0] masku_operand_a_i; - logic [NrLanes-1:0] masku_operand_a_valid_i; - logic [NrLanes-1:0] masku_operand_a_ready_o; + // ALU/FPU result (shuffled) + elen_t [NrLanes-1:0] masku_operand_alu; + logic [NrLanes-1:0] masku_operand_alu_valid; + logic [NrLanes-1:0] masku_operand_alu_ready; - // Previous value of the destination vector register - elen_t [NrLanes-1:0] masku_operand_b_i; - logic [NrLanes-1:0] masku_operand_b_valid_i; - logic [NrLanes-1:0] masku_operand_b_ready_o; + // ALU/FPU result (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_alu_seq; + + // vs2 (shuffled) + elen_t [NrLanes-1:0] masku_operand_vs2; + logic [NrLanes-1:0] masku_operand_vs2_valid; + logic [NrLanes-1:0] masku_operand_vs2_ready; + + assign masku_operand_vs2_ready = 1'b0; + + // vs2 (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq; + logic [ NrLanes-1:0] masku_operand_vs2_seq_valid; + logic [ NrLanes-1:0] masku_operand_vs2_seq_ready; // Mask - elen_t [NrLanes-1:0] masku_operand_m_i; - logic [NrLanes-1:0] masku_operand_m_valid_i; - logic [NrLanes-1:0] masku_operand_m_ready_o; + elen_t [NrLanes-1:0] masku_operand_m; + logic [NrLanes-1:0] masku_operand_m_valid; + logic [NrLanes-1:0] masku_operand_m_ready; + + // Mask deshuffled + logic [NrLanes*ELEN-1:0] masku_operand_m_seq; + logic [NrLanes-1:0] masku_operand_m_seq_valid; + logic [NrLanes-1:0] 
masku_operand_m_seq_ready; // Insn-queue related signal pe_req_t vinsn_issue; - for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands - assign masku_operand_a_i[lane] = masku_operand_i[lane][2 + masku_operand_fu]; - assign masku_operand_a_valid_i[lane] = masku_operand_valid_i[lane][2 + masku_operand_fu]; - for (genvar operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin: gen_masku_operand_ready - assign masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_operand_fu) && masku_operand_a_ready_o[lane]; - end: gen_masku_operand_ready + logic [NrLanes*ELEN-1:0] bit_enable_mask; + logic [NrLanes*ELEN-1:0] bit_enable_shuffle; + logic [NrLanes*ELEN-1:0] alu_result_compressed; + + // Performs all shuffling and deshuffling of mask operands (including masks for mask instructions) + // Furthermore, it buffers certain operands that would create long critical paths + masku_operands #( + .NrLanes ( NrLanes ) + ) i_masku_operands ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + // Control logic + .masku_fu_i ( masku_operand_fu ), + .vinsn_issue_i ( vinsn_issue ), + .vrf_pnt_i ( vrf_pnt_q ), + // Operands coming from lanes + .masku_operand_valid_i ( masku_operand_valid_i ), + .masku_operand_ready_o ( masku_operand_ready_o ), + .masku_operands_i ( masku_operand_i ), + // Operands prepared for mask unit execution + .masku_operand_alu_o ( masku_operand_alu ), + .masku_operand_alu_valid_o ( masku_operand_alu_valid ), + .masku_operand_alu_ready_i ( masku_operand_alu_ready ), + .masku_operand_alu_seq_o ( masku_operand_alu_seq ), + .masku_operand_alu_seq_valid_o ( ), + .masku_operand_alu_seq_ready_i ( ), + .masku_operand_vs2_o ( masku_operand_vs2 ), + .masku_operand_vs2_valid_o ( masku_operand_vs2_valid ), + .masku_operand_vs2_ready_i ( masku_operand_vs2_ready ), + .masku_operand_vs2_seq_o ( masku_operand_vs2_seq ), + .masku_operand_vs2_seq_valid_o ( masku_operand_vs2_seq_valid ), + .masku_operand_vs2_seq_ready_i ( 
masku_operand_vs2_seq_ready ), + .masku_operand_m_o ( masku_operand_m ), + .masku_operand_m_valid_o ( masku_operand_m_valid ), + .masku_operand_m_ready_i ( masku_operand_m_ready ), + .masku_operand_m_seq_o ( masku_operand_m_seq ), + .masku_operand_m_seq_valid_o ( ), + .masku_operand_m_seq_ready_i ( ), + .bit_enable_mask_o ( bit_enable_mask ), + .shuffled_vl_bit_mask_o ( bit_enable_shuffle ), + .alu_result_compressed_o ( alu_result_compressed ) + ); + - assign masku_operand_b_i[lane] = masku_operand_i[lane][1]; - assign masku_operand_b_valid_i[lane] = (vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : masku_operand_valid_i[lane][1]; - assign masku_operand_ready_o[lane][1] = masku_operand_b_ready_o[lane]; + // Local Parameter W_CPOP and W_VFIRST + // + // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. + // + // Legal range W_CPOP: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range W_VFIRST: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // + // Execution time example for vcpop.m (similar for vfirst.m): + // W_CPOP = 64; VLEN = 1024; vl = 1024 + // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] + localparam int W_CPOP = 64; + localparam int W_VFIRST = 64; + // derived parameters + localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? 
W_CPOP : W_VFIRST; + localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; + localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; + // Check if parameters are within range + if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 64)) begin + $fatal(1, "Parameter W_CPOP must be power of 2."); + end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 64)) begin + $fatal(1, "Parameter W_VFIRST must be power of 2."); + end + + // VFIRST and VCPOP Signals + logic [NrLanes*ELEN-1:0] vcpop_operand; + logic [$clog2(W_VFIRST):0] popcount; + logic [$clog2(VLEN):0] popcount_d, popcount_q; + logic [$clog2(W_VFIRST)-1:0] vfirst_count; + logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; + logic vfirst_empty; + logic [NrLanes-1:0] vcpop_vfirst_vs2_ready; + // counter to keep track of how many slices of the vcpop_operand have been processed + logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; + logic [W_CPOP-1:0] vcpop_slice; + logic [W_VFIRST-1:0] vfirst_slice; + + // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables + logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m, masku_operand_alu_seq_f, masku_operand_alu_seq_ff; + logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; + logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; + logic [ 13:0] iteration_count_d, iteration_count_q; + logic not_found_one_d, not_found_one_q; + logic [ NrLanes-1:0] vmsif_vmsof_vmsbf_vs2_ready; - assign masku_operand_m_i[lane] = masku_operand_i[lane][0]; - assign masku_operand_m_valid_i[lane] = masku_operand_valid_i[lane][0]; - assign masku_operand_ready_o[lane][0] = masku_operand_m_ready_o[lane]; - end: gen_unpack_masku_operands + // Control flow for mask operands + assign masku_operand_vs2_seq_ready = vcpop_vfirst_vs2_ready | vmsif_vmsof_vmsbf_vs2_ready; //////////////////////////////// // Vector instruction queue // @@ -217,16 +322,6 @@ module 
masku import ara_pkg::*; import rvv_pkg::*; #( logic result_queue_empty; assign result_queue_empty = (result_queue_cnt_q == '0); - // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables - logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; - logic [NrLanes*DataWidth-1:0] alu_operand_a, alu_operand_a_seq, alu_operand_a_seq_f; - logic [NrLanes*DataWidth-1:0] alu_operand_b, alu_operand_b_seq, alu_operand_b_seq_m, alu_operand_b_seq_f, alu_operand_b_seq_ff; - logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; - logic [NrLanes*DataWidth-1:0] masku_operand_vd; - logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; - logic [4:0] iteration_count_d, iteration_count_q; - logic not_found_one_d, not_found_one_q; - always_ff @(posedge clk_i or negedge rst_ni) begin: p_result_queue_ff if (!rst_ni) begin result_queue_q <= '0; @@ -238,8 +333,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= '0; alu_result_ff <= '0; not_found_one_q <= 1'b1; - alu_operand_b_seq_f <= '0; - alu_operand_b_seq_ff <= '0; + masku_operand_alu_seq_f <= '0; + masku_operand_alu_seq_ff <= '0; iteration_count_q <= '0; end else begin result_queue_q <= result_queue_d; @@ -251,15 +346,15 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= (pe_req_ready_o) ? '0 : (!vinsn_issue.vm) ? alu_result_vm : alu_result_vm_seq; alu_result_ff <= alu_result_f; not_found_one_q <= not_found_one_d; - alu_operand_b_seq_f <= (pe_req_ready_o) ? '0 : alu_operand_b_seq_m; - alu_operand_b_seq_ff <= alu_operand_b_seq_f; + masku_operand_alu_seq_f <= (pe_req_ready_o) ? 
'0 : masku_operand_alu_seq_m; + masku_operand_alu_seq_ff <= masku_operand_alu_seq_f; iteration_count_q <= iteration_count_d; end end // iteration count for masked instrctions always_comb begin - if (vinsn_issue_valid && &masku_operand_a_valid_i) begin + if (vinsn_issue_valid && (&masku_operand_alu_valid || &masku_operand_vs2_seq_valid)) begin iteration_count_d = iteration_count_q + 1'b1; end else begin iteration_count_d = iteration_count_q; @@ -291,114 +386,49 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( //////////////// elen_t [NrLanes-1:0] alu_result; - logic [NrLanes*ELEN-1:0] bit_enable; - logic [NrLanes*ELEN-1:0] bit_enable_shuffle; - logic [NrLanes*ELEN-1:0] bit_enable_mask; - rvv_pkg::vew_e bit_enable_shuffle_eew; logic [NrLanes*ELEN-1:0] mask; - logic [NrLanes*ELEN-1:0] vcpop_operand; - logic [$clog2(DataWidth*NrLanes):0] popcount; - logic [$clog2(VLEN):0] popcount_d, popcount_q; - logic [$clog2(DataWidth*NrLanes)-1:0] vfirst_count; - logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; - logic vfirst_empty; - // Pointers - // - // We need a pointer to which bit on the full VRF word we are reading mask operands from. - logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; - // We need a pointer to which bit on the full VRF word we are writing results to. 
- logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; + // keep track if first 1 mask element was found + logic vfirst_found; - // Remaining elements of the current instruction in the read operand phase - vlen_t read_cnt_d, read_cnt_q; - // Remaining elements of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; - // Remaining elements of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; + // assign operand slices to be processed by popcount and lzc + assign vcpop_slice = vcpop_operand[(vcpop_slice_cnt_q * W_CPOP) +: W_CPOP]; + assign vfirst_slice = vcpop_operand[(vcpop_slice_cnt_q * W_VFIRST) +: W_VFIRST]; // Population count for vcpop.m instruction popcount #( - .INPUT_WIDTH (DataWidth*NrLanes) + .INPUT_WIDTH (W_CPOP) ) i_popcount ( - .data_i (vcpop_operand), + .data_i (vcpop_slice), .popcount_o(popcount ) ); // Trailing zero counter lzc #( - .WIDTH(DataWidth*NrLanes), + .WIDTH(W_VFIRST), .MODE (0) ) i_clz ( - .in_i (vcpop_operand), + .in_i (vfirst_slice ), .cnt_o (vfirst_count ), .empty_o (vfirst_empty ) ); always_comb begin: p_mask_alu alu_result = '0; - bit_enable = '0; - bit_enable_shuffle = '0; - bit_enable_mask = '0; not_found_one_d = pe_req_ready_o ? 1'b1 : not_found_one_q; alu_result_vm = '0; alu_result_vm_m = '0; alu_result_vm_seq = '0; - alu_operand_b_seq = '0; - alu_operand_b_seq_m = '0; + masku_operand_alu_seq_m = '0; mask = '0; - masku_operand_vd = '0; vcpop_operand = '0; - // Comparisons work on vtype.vsew from VALU or VMFPU - bit_enable_shuffle_eew = vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} - ? vinsn_issue.vtype.vsew - : vinsn_issue.eew_vd_op; - if (vinsn_issue_valid) begin - // Calculate bit enable - // The result can be taken either from the result of an operation (mask_operand_a_i), or - // from the previous value of the destination register (mask_operand_b_i). Byte strobes - // do not work here, since this has to be done at a bit granularity. 
Therefore, the Mask Unit - // received both operands, and does a masking depending on the value of the vl. - if (vinsn_issue.vl >= ELEN*NrLanes) - bit_enable = '1; - else begin - bit_enable[vinsn_issue.vl] = 1'b1; - bit_enable = bit_enable - 1; - end - - // Shuffle the bit enable signal - for (int b = 0; b < NrLanes*StrbWidth; b++) begin - automatic int vrf_byte = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); - bit_enable_shuffle[8*vrf_byte +: 8] = bit_enable[8*b +: 8]; - - // Take the mask into account - if (!vinsn_issue.vm) begin - automatic int mask_byte = shuffle_index(b, NrLanes, vinsn_issue.eew_vmask); - automatic int mask_byte_lane = mask_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int mask_byte_offset = mask_byte[idx_width(StrbWidth)-1:0]; - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8] & - masku_operand_m_i[mask_byte_lane][8*mask_byte_offset +: 8]; - end else begin - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8]; - end - end - - alu_operand_a = masku_operand_a_i; - alu_operand_b = masku_operand_b_i; - - // Deshuffle the operands for the mask instructions - for (int b = 0; b < (NrLanes*StrbWidth); b++) begin - automatic int deshuffle_byte = deshuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - alu_operand_b_seq[8*deshuffle_byte +: 8] = alu_operand_a[8*b +: 8]; - masku_operand_vd [8*deshuffle_byte +: 8] = alu_operand_b[8*b +: 8]; - end // Mask generation unique case (vinsn_issue.op) inside [VMSBF:VID] : - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : for (int i = 0; i < (DataWidth * NrLanes)/8; i++) mask [(i*8) +: 8] = {8{bit_enable_mask [i+(((DataWidth * NrLanes)/8)*(iteration_count_d-1))]}}; @@ -417,156 +447,77 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Evaluate the instruction unique case (vinsn_issue.op) inside - [VMANDNOT:VMXNOR]: alu_result = (masku_operand_a_i & bit_enable_mask) | - 
(masku_operand_b_i & ~bit_enable_mask); - [VMFEQ:VMSGTU], [VMSGT:VMSBC] : begin - automatic logic [ELEN*NrLanes-1:0] alu_result_flat = '0; - - unique case (vinsn_issue.vtype.vsew) - EW8: for (int b = 0; b < 8*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(1*b, NrLanes, EW8); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW8); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW16: for (int b = 0; b < 4*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(2*b, NrLanes, EW16); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW16); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? 
- masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW32: for (int b = 0; b < 2*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(4*b, NrLanes, EW32); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW32); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW64: for (int b = 0; b < 1*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(8*b, NrLanes, EW64); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW64); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? 
- masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - default:; - endcase - - // Final assignment - alu_result = (alu_result_flat & bit_enable_shuffle) | - (masku_operand_b_i & ~bit_enable_shuffle); - end + [VMANDNOT:VMXNOR]: alu_result = (masku_operand_alu) | (~bit_enable_shuffle); + [VMFEQ:VMSGTU], [VMSGT:VMSBC]: alu_result = (alu_result_compressed & bit_enable_mask) | (~bit_enable_shuffle); [VMSBF:VMSIF] : begin - if (&masku_operand_a_valid_i) begin - for (int i = 0; i < NrLanes * DataWidth; i++) begin - if (alu_operand_b_seq[i] == 1'b0) begin - alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; - end else begin - not_found_one_d = 1'b0; - alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; - break; - end + if (&masku_operand_vs2_seq_valid && (&masku_operand_m_valid || vinsn_issue.vm)) begin + for (int i = 0; i < NrLanes * DataWidth; i++) begin + if (masku_operand_vs2_seq[i] == 1'b0) begin + alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; + end else begin + not_found_one_d = 1'b0; + alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; + break; end - alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; + end + alu_result_vm_m = (!vinsn_issue.vm) ? 
alu_result_vm & bit_enable_mask : alu_result_vm; end else begin alu_result_vm = '0; end end VIOTA: begin - if (&masku_operand_a_valid_i) begin - alu_operand_b_seq_m = alu_operand_b_seq & bit_enable_mask; + if (&masku_operand_alu_valid) begin + masku_operand_alu_seq_m = masku_operand_alu_seq & bit_enable_mask; unique case (vinsn_issue.vtype.vsew) EW8 : begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [7:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; + alu_result_vm [7:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; end else begin alu_result_vm [7:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; - alu_result_vm_m [(index*8) +: 7] = (|mask[(index*8) +: 7]) ? alu_result_vm [(index*8) +: 7] : masku_operand_vd [(index*8) +: 7]; + alu_result_vm [(index*8) +: 7] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; + alu_result_vm_m [(index*8) +: 7] = alu_result_vm [(index*8) +: 7]; end end EW16: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [15:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; + alu_result_vm [15:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; end else begin alu_result_vm [15:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; - alu_result_vm_m [(index*16) +: 15] = (|mask[(index*16) +: 15]) ? 
alu_result_vm [(index*16) +: 15] : masku_operand_vd [(index*16) +: 15]; + alu_result_vm [(index*16) +: 15] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; + alu_result_vm_m [(index*16) +: 15] = alu_result_vm [(index*16) +: 15]; end end EW32: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [31:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; + alu_result_vm [31:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; end else begin alu_result_vm [31:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; - alu_result_vm_m [(index*32) +: 31] = (|mask[(index*32) +: 31]) ? alu_result_vm [(index*32) +: 31] : masku_operand_vd [(index*32) +: 31]; + alu_result_vm [(index*32) +: 31] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; + alu_result_vm_m [(index*32) +: 31] = alu_result_vm [(index*32) +: 31]; end end EW64: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [63:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; + alu_result_vm [63:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; end else begin alu_result_vm [63:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; - alu_result_vm_m [(index*64) +: 63] = (|mask[(index*64) +: 63]) ? 
alu_result_vm [(index*64) +: 63] : masku_operand_vd [(index*64) +: 63]; + alu_result_vm [(index*64) +: 63] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; + alu_result_vm_m [(index*64) +: 63] = alu_result_vm [(index*64) +: 63]; end end endcase end end VID: begin - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : begin for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin @@ -596,7 +547,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( end end [VCPOP:VFIRST] : begin - vcpop_operand = (!vinsn_issue.vm) ? masku_operand_a_i & bit_enable_mask : masku_operand_a_i; + vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vs2_seq & bit_enable_mask : masku_operand_vs2_seq; end default: begin alu_result = '0; @@ -639,9 +590,16 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic last_incoming_a; logic unbalanced_a; + // Control signals for better code-readability (this signals goes high if a result is valid and can be pushed to the result_queue) + logic vreg_wb_valid; + // Information about which is the target FU of the request assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; + // Byte enable for the result queue + logic [NrLanes*ELENB-1:0] result_queue_be_seq; + logic [NrLanes*ELENB-1:0] result_queue_be; + always_comb begin: p_masku // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -652,8 +610,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_d = mask_pnt_q; vrf_pnt_d = vrf_pnt_q; - popcount_d = popcount_q; - vfirst_count_d = vfirst_count_q; + vcpop_slice_cnt_d = vcpop_slice_cnt_q; + popcount_d = popcount_q; + vfirst_count_d = vfirst_count_q; mask_queue_d = mask_queue_q; mask_queue_valid_d = mask_queue_valid_q; @@ -676,9 +635,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default pe_resp = '0; - masku_operand_a_ready_o = '0; - masku_operand_b_ready_o = '0; - masku_operand_m_ready_o = '0; + masku_operand_alu_ready = '0; + masku_operand_m_ready = '0; // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -705,7 +663,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin // Is there place in the mask queue to write the mask operands? // Did we receive the mask bits on the MaskM channel? - if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid_i) begin + if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid && !(vinsn_issue.op inside {VMSBF, VMSOF, VMSIF})) begin // Copy data from the mask operands into the mask queue for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin // Map vrf_seq_byte to the corresponding byte in the VRF word. 
@@ -737,7 +695,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Copy the mask operand mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] = - masku_operand_m_i[mask_lane][mask_offset]; + masku_operand_m[mask_lane][mask_offset]; end // Account for the used operands @@ -766,7 +724,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Consumed all valid bytes from the lane operands if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin // Request another beat - masku_operand_m_ready_o = '1; + masku_operand_m_ready = '1; // Reset the pointer mask_pnt_d = '0; end @@ -777,31 +735,62 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Calculate scalar results // ////////////////////////////// + vcpop_vfirst_vs2_ready = 1'b0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && vd_scalar(vinsn_issue.op)) begin - if (&(masku_operand_a_valid_i | fake_a_valid) && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin - - masku_operand_a_ready_o = masku_operand_a_valid_i; + if (&(masku_operand_vs2_seq_valid | fake_a_valid) && (&masku_operand_m_valid || vinsn_issue.vm)) begin + + // increment slice counter + vcpop_slice_cnt_d = vcpop_slice_cnt_q + 1'b1; + + // request new operand (by completing ready-valid handshake) once all slices have been processed + vcpop_vfirst_vs2_ready = 1'b0; + if (((vcpop_slice_cnt_q == N_SLICES_CPOP - 1) && vinsn_issue.op == VCPOP) || + ((vcpop_slice_cnt_q == N_SLICES_VFIRST-1) && vinsn_issue.op == VFIRST)) begin + vcpop_slice_cnt_d = '0; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end + end // Account for the elements that were processed - issue_cnt_d = issue_cnt_q - ((NrLanes*DataWidth)/(8 << vinsn_issue.vtype.vsew)); - if (iteration_count_d >= (((8 << vinsn_issue.vtype.vsew)*vinsn_issue.vl)/(DataWidth*NrLanes))) - issue_cnt_d = '0; + issue_cnt_d = issue_cnt_q - W_CPOP; - // Acknowledge the operands, also triggers another 
beat if necessary - if (!vinsn_issue.vm) masku_operand_m_ready_o = '1; + // abruptly stop processing elements if vl is reached + if (iteration_count_d >= (vinsn_issue.vl/(W_CPOP)) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin + issue_cnt_d = '0; + commit_cnt_d = '0; + read_cnt_d ='0; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end + end popcount_d = popcount_q + popcount; vfirst_count_d = vfirst_count_q + vfirst_count; // if this is the last beat, commit the result to the scalar_result queue - if (iteration_count_d >= (((8 << vinsn_issue.vtype.vsew)*vinsn_issue.vl)/(DataWidth*NrLanes))) begin + if ((iteration_count_d >= (vinsn_issue.vl/W_CPOP) && vinsn_issue.op == VCPOP) || + (iteration_count_d >= (vinsn_issue.vl/W_VFIRST) && vinsn_issue.op == VFIRST) || + (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : (vfirst_empty) ? -1 : vfirst_count_d; result_scalar_valid_d = '1; // Decrement the commit counter by the entire number of elements, // since we only commit one result for everything commit_cnt_d = '0; + + // reset vcpop slice counter, since instruction is finished + vcpop_slice_cnt_d = '0; + + // acknowledge operand a + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end end end end @@ -810,14 +799,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Write results to the lanes // ////////////////////////////////// + result_queue_be = '1; + result_queue_be_seq = '1; + vmsif_vmsof_vmsbf_vs2_ready = '0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && !vd_scalar(vinsn_issue.op)) begin // This instruction executes on the Mask Unit if (vinsn_issue.vfu == VFU_MaskUnit) begin // Is there place in the result queue to write the results? // Did we receive the operands? 
- if (!result_queue_full && &(masku_operand_a_valid_i | fake_a_valid) && - (!vinsn_issue.use_vd_op || &masku_operand_b_valid_i)) begin + if (!result_queue_full && (&(masku_operand_alu_valid | fake_a_valid | masku_operand_vs2_seq_valid))) begin // How many elements are we committing in total? // Since we are committing bits instead of bytes, we carry out the following calculation // with ceil(vl/8) instead. @@ -832,7 +824,37 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the operands of this instruction. // At this stage, acknowledge only the first operand, "a", coming from the ALU/VMFpu. - masku_operand_a_ready_o = masku_operand_a_valid_i; + masku_operand_alu_ready = masku_operand_alu_valid; + vmsif_vmsof_vmsbf_vs2_ready = (&masku_operand_m_valid || vinsn_issue.vm) ? '1 : '0; + + if (!vinsn_issue.vm) begin + unique case (vinsn_issue.vtype.vsew) + EW8 : result_queue_be_seq = masku_operand_m_seq[NrLanes*ELENB-1:0]; + EW16: begin + for (int i = 0; i < NrLanes * ELENB / 2; i++) begin + result_queue_be_seq[2*i +: 2] = {2{bit_enable_mask[i]}}; + end + end + EW32: begin + for (int i = 0; i < NrLanes * ELENB / 4; i++) begin + result_queue_be_seq[4*i +: 4] = {4{bit_enable_mask[i]}}; + end + end + EW64: begin + for (int i = 0; i < NrLanes * ELENB / 8; i++) begin + result_queue_be_seq[8*i +: 8] = {8{bit_enable_mask[i]}}; + end + end + default: ; // Not sure what should be the default + endcase + for (int i = 0; i < NrLanes*ELENB; i++) begin + result_queue_be[shuffle_index(i, NrLanes, vinsn_issue.vtype.vsew)] = result_queue_be_seq[i]; + end + end + + if (vinsn_issue.op inside {[VMSBF: VMSIF], VID}) begin + result_queue_be = '1; + end // Store the result in the operand queue for (int unsigned lane = 0; lane < NrLanes; lane++) begin @@ -843,8 +865,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_write_pnt_q][lane] = '{ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], - be : 
(vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + + be : (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_be[lane*ELENB +: ELENB] : be(element_cnt, vinsn_issue.vtype.vsew), + addr : (vinsn_issue.op inside {[VIOTA:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), id : vinsn_issue.id }; @@ -858,9 +880,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Acknowledge the rest of the operands, which are accessed bit by bit. - masku_operand_b_ready_o = masku_operand_b_valid_i; - // Reset VRF pointer vrf_pnt_d = '0; @@ -877,33 +896,34 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_d = '0; end end else if (vinsn_issue.op inside {[VMSBF:VID]}) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. 
- masku_operand_b_ready_o = masku_operand_b_valid_i; + if (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {VIOTA, VID}) begin + result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + // Increment result queue pointers and counters + result_queue_cnt_d += 1; + if (result_queue_write_pnt_q == ResultQueueDepth-1) + result_queue_write_pnt_d = '0; + else + result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - if (result_queue_read_pnt_q == ResultQueueDepth-1) - result_queue_read_pnt_d = '0; - else - result_queue_read_pnt_d = result_queue_read_pnt_m; + if (result_queue_read_pnt_q == ResultQueueDepth-1) + result_queue_read_pnt_d = '0; + else + result_queue_read_pnt_d = result_queue_read_pnt_m; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); - if ((vinsn_issue.vl-issue_cnt_d)*4 >= vinsn_issue.vl) - issue_cnt_d = '0; + // Account for the results that were issued + if (vinsn_issue.op inside {VIOTA, VID}) begin + issue_cnt_d = issue_cnt_q - (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end else begin + issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end + end end else begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. 
- masku_operand_b_ready_o = masku_operand_b_valid_i; - // Increment result queue pointers and counters result_queue_cnt_d += 1; if (result_queue_write_pnt_q == ResultQueueDepth-1) @@ -923,13 +943,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( /////////////////////////// //// Masked Instruction /// /////////////////////////// - if (vinsn_commit_valid && vinsn_commit.op inside {[VMSBF:VID]}) begin - if (&masku_operand_a_valid_i && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin - // if this is the last beat, commit the result to the scalar_result queue - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if ((vinsn_commit.vl-commit_cnt_d)*4 >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end + if ((|masku_operand_alu_valid && !result_queue_full) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {[VIOTA:VID]}) begin + // if this is the last beat, commit the result to the scalar_result queue + commit_cnt_d = commit_cnt_q - (NrLanes << (int'(EW64) - vinsn_commit.vtype.vsew)); + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; + end + end + if ((&masku_operand_alu_valid || &masku_operand_vs2_seq_valid) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {VMSBF, VMSOF, VMSIF}) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; end end @@ -983,9 +1007,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be used - commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) - commit_cnt_d = '0; + if (vldu_mask_ready_i || vstu_mask_ready_i || sldu_mask_ready_i || vinsn_issue.vm || (vinsn_issue.vfu != VFU_MaskUnit)) begin + 
commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) + commit_cnt_d = '0; + end end ////////////////////////////////// @@ -1030,9 +1056,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_read_pnt_q] = '0; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; - if (commit_cnt_q < (NrLanes * DataWidth)) - commit_cnt_d = '0; + if (!(vinsn_issue.op inside {VID, VSE})) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if (commit_cnt_q < (NrLanes * DataWidth)) + commit_cnt_d = '0; + end end /////////////////////////// @@ -1135,6 +1163,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_q <= '0; pe_resp_o <= '0; result_final_gnt_q <= '0; + vcpop_slice_cnt_q <= '0; popcount_q <= '0; vfirst_count_q <= '0; end else begin @@ -1146,6 +1175,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_q <= mask_pnt_d; pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; + vcpop_slice_cnt_q <= vcpop_slice_cnt_d; popcount_q <= popcount_d; vfirst_count_q <= vfirst_count_d; end diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv new file mode 100644 index 000000000..86b5e6988 --- /dev/null +++ b/hardware/src/masku/masku_operands.sv @@ -0,0 +1,233 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Mask Unit Operands Module +// +// Author: Moritz Imfeld +// +// +// Description: +// Module takes operands coming from the lanes and then unpacks and prepares them +// for mask instruction execution. 
+// +// +// Incoming Operands: +// masku_operands_i = {v0.m, vs2, alu_result, fpu_result} +// + +module masku_operands import ara_pkg::*; import rvv_pkg::*; #( + parameter int unsigned NrLanes = 0 + ) ( + input logic clk_i, + input logic rst_ni, + + // Control logic + input masku_fu_e masku_fu_i, // signal deciding from which functional unit the result should be taken from + input pe_req_t vinsn_issue_i, + input logic [idx_width(ELEN*NrLanes):0] vrf_pnt_i, + + // Operands and operand handshake signals coming from lanes + input logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_valid_i, + output logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_ready_o, + input elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operands_i, + + // Operands prepared for masku execution + output elen_t [ NrLanes-1:0] masku_operand_alu_o, // ALU/FPU result (shuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_alu_seq_o, // ALU/FPU result (deshuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_vs2_o, // vs2 (shuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq_o, // vs2 (deshuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_m_o, // Mask (shuffled) + output logic [ NrLanes-1:0] masku_operand_m_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_m_seq_o, // Mask (deshuffled) + output logic [ NrLanes-1:0] masku_operand_m_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_seq_ready_i, + output logic 
[NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] shuffled_vl_bit_mask_o, // vl mask for mask unit instructions (first vl bits are 1, others 0) (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] alu_result_compressed_o // ALU/FPU results compressed (from sew to 1-bit) (shuffled, in mask format) + ); + + // Imports + import cf_math_pkg::idx_width; + + // Local Parameter + localparam int unsigned DATAPATH_WIDTH = NrLanes * ELEN; // Mask Unit datapath width + localparam int unsigned ELEN_BYTES = ELEN / 8; + + // Helper signals + logic [DATAPATH_WIDTH-1:0] deshuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + logic [DATAPATH_WIDTH-1:0] shuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + vew_e bit_enable_shuffle_eew; + + elen_t [NrLanes-1:0] masku_operand_vs2_d; + logic masku_operand_vs2_lane_valid; + logic masku_operand_vs2_lane_ready; + logic masku_operand_vs2_spill_valid; + logic masku_operand_vs2_spill_ready; + + + // Extract operands from input (input comes in "shuffled form" from the lanes) + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_m_o[lane] = masku_operands_i[lane][0]; + assign masku_operand_vs2_d[lane] = masku_operands_i[lane][1]; + assign masku_operand_alu_o[lane] = masku_operands_i[lane][2 + masku_fu_i]; + end + + // ---------- + // Deshuffle vs2 + // ---------- + always_comb begin + masku_operand_m_seq_o = '0; + masku_operand_vs2_seq_o = '0; + masku_operand_alu_seq_o = '0; + for (int b = 0; b < (NrLanes * ELEN_BYTES); b++) begin + automatic int deshuffle_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int deshuffle_m_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + automatic int lane_idx = b / ELEN_BYTES; // rounded down to nearest integer + automatic int lane_offset = b % ELEN_BYTES; + masku_operand_alu_seq_o[8*deshuffle_idx +: 8] = 
masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; + masku_operand_vs2_seq_o[8*deshuffle_idx +: 8] = masku_operand_vs2_o[lane_idx][8*lane_offset +: 8]; + masku_operand_m_seq_o[8*deshuffle_m_idx +: 8] = masku_operand_m_o[lane_idx][8*lane_offset +: 8]; + end + end + + always_comb begin + masku_operand_vs2_spill_ready = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_spill_ready &= masku_operand_vs2_ready_i[lane] | masku_operand_vs2_seq_ready_i[lane]; + end + end + + spill_register #( + .T ( elen_t [NrLanes-1:0] ), + .Bypass ( 1'b0 ) + ) i_spill_register_vs2 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_vs2_lane_valid), + .ready_o (masku_operand_vs2_lane_ready), + .data_i (masku_operand_vs2_d), + .valid_o (masku_operand_vs2_spill_valid), + .ready_i (masku_operand_vs2_spill_ready), + .data_o (masku_operand_vs2_o) + ); + + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_vs2_valid_o[lane] = masku_operand_vs2_spill_valid; + assign masku_operand_vs2_seq_valid_o[lane] = masku_operand_vs2_spill_valid; + end + + always_comb begin + masku_operand_vs2_lane_valid = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_lane_valid &= masku_operand_valid_i[lane][1]; + end + end + + // ------------------------------------------------ + // Generate shuffled and unshuffled bit level masks + // ------------------------------------------------ + + // Generate shuffled bit level mask + assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} ? 
vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; + + always_comb begin + // Default assignments + deshuffled_vl_bit_mask = '0; + shuffled_vl_bit_mask = '0; + bit_enable_mask_o = '0; + + // Generate deshuffled vl bit mask + for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin + if (i < vinsn_issue_i.vl) begin + deshuffled_vl_bit_mask[i] = 1'b1; + end + end + + for (int unsigned b = 0; b < NrLanes * ELEN_BYTES; b++) begin + // local helper signals + logic [idx_width(DATAPATH_WIDTH)-1:0] src_operand_byte_shuffle_index; + logic [idx_width(DATAPATH_WIDTH)-1:0] mask_operand_byte_shuffle_index; + logic [ idx_width(NrLanes)-1:0] mask_operand_byte_shuffle_lane_index; + logic [ idx_width(ELEN_BYTES)-1:0] mask_operand_byte_shuffle_lane_offset; + + // get shuffle idices + // Note: two types of shuffle indices are needed because the source operand and the + // mask register might not have the same effective element width (eew) + src_operand_byte_shuffle_index = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); + mask_operand_byte_shuffle_index = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES)-1:0]; + + // shuffle bit enable + shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + + // Generate bit-level mask + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; + if (!vinsn_issue_i.vm && !(vinsn_issue_i.op inside {VMADC, VMSBC})) begin // exception for VMADC and VMSBC, because they use the mask register as a source operand (and not as a mask) + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] &= masku_operand_m_o[mask_operand_byte_shuffle_lane_index][8*mask_operand_byte_shuffle_lane_offset +: 8]; + end + end + end + + assign 
shuffled_vl_bit_mask_o = shuffled_vl_bit_mask; + + + // ------------------------------------------- + // Compress ALU/FPU results into a mask vector + // ------------------------------------------- + always_comb begin + alu_result_compressed_o = '0; + for (int b = 0; b < ELEN_BYTES * NrLanes; b++) begin + if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin + automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int src_byte_lane = src_byte[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + automatic int src_byte_offset = src_byte[idx_width(ELEN_BYTES)-1:0]; + + automatic int dest_bit_seq = (b >> vinsn_issue_i.vtype.vsew) + vrf_pnt_i; + automatic int dest_byte_seq = dest_bit_seq / ELEN_BYTES; + automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, vinsn_issue_i.vtype.vsew); + alu_result_compressed_o[ELEN_BYTES * dest_byte + dest_bit_seq[idx_width(ELEN_BYTES)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset]; + end + end + end + + + // Control + for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands + // immediately acknowledge operands coming from functional units + assign masku_operand_alu_valid_o[lane] = masku_operand_valid_i[lane][2 + masku_fu_i]; + + assign masku_operand_m_valid_o[lane] = masku_operand_valid_i[lane][0]; + + assign masku_operand_m_seq_valid_o[lane] = masku_operand_valid_i[lane][0]; + end: gen_unpack_masku_operands + + + // assign the operand_ready signal that goes to the lane operand queues + always_comb begin + // by default, assign '0 to operand ready signals + masku_operand_ready_o = '0; + for (int lane = 0; lane < NrLanes; lane++) begin + // Acknowledge alu operand + for (int operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin + masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_fu_i) && masku_operand_alu_ready_i[lane]; + end + // Acknowledge vs2 operands + masku_operand_ready_o[lane][1] = masku_operand_vs2_lane_ready; + 
// Acknowledge mask operand + masku_operand_ready_o[lane][0] = masku_operand_m_ready_i[lane]; + end + end + + +endmodule : masku_operands diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index a7b384ef9..45c4bda3b 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -733,13 +733,14 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( result_queue_write_pnt_d = NP2_BUFFER_PNT; // Prepare the read pointer result_queue_read_pnt_d = NP2_RESULT_PNT; - // Setup the mux sel as soon as we get one operand - if (sldu_operand_valid_i[0]) + // Setup the mux sel as soon as we get the operands + if (&(sldu_operand_valid_i | sldu_operand_valid)) np2_loop_mux_sel_d = NP2_LOOP_SEL; // Setup the p2-stride generator p2_stride_gen_stride_d = stride_t'(vinsn_issue_q.stride >> vinsn_issue_q.vtype.vsew); p2_stride_gen_valid_d = 1'b1; // Start processing the first VRF chunk as soon as the result queue is completely empty + // and the VRF chunk is complete if (np2_loop_mux_sel_q == NP2_LOOP_SEL && result_queue_empty) begin state_d = SLIDE_NP2_RUN; end diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index a25d086a1..824834199 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -32,9 +32,11 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( input pe_req_t pe_req_i, input logic pe_req_valid_i, input logic [NrVInsn-1:0] pe_vinsn_running_i, - output logic addrgen_error_o, + output ariane_pkg::exception_t addrgen_exception_o, output logic addrgen_ack_o, - output vlen_t addrgen_error_vl_o, + output vlen_t addrgen_exception_vstart_o, + output logic addrgen_exception_load_o, + output logic addrgen_exception_store_o, // Interface with the load/store units output addrgen_axi_req_t axi_addrgen_req_o, output logic axi_addrgen_req_valid_o, @@ -117,7 +119,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addr_t idx_final_addr_d, idx_final_addr_q; elen_t idx_addr; logic idx_op_error_d, 
idx_op_error_q; - vlen_t addrgen_error_vl_d; + vlen_t addrgen_exception_vstart_d; // Pointer to point to the correct logic [$clog2(NrLanes)-1:0] word_lane_ptr_d, word_lane_ptr_q; @@ -177,7 +179,11 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Nothing to acknowledge addrgen_ack_o = 1'b0; - addrgen_error_o = 1'b0; + addrgen_exception_o.valid = 1'b0; + addrgen_exception_o.tval = '0; + addrgen_exception_o.cause = '0; + addrgen_exception_load_o = 1'b0; + addrgen_exception_store_o = 1'b0; // No valid words for the spill register idx_addr_valid_d = 1'b0; @@ -240,7 +246,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin state_d = IDLE; addrgen_ack_o = 1'b1; - addrgen_error_o = 1'b1; + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end else begin addrgen_req = '{ addr : pe_req_q.scalar_op, @@ -356,10 +364,16 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( word_lane_ptr_d = '0; // Raise an error if necessary if (idx_op_error_q) begin - addrgen_error_o = 1'b1; + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end end endcase + if ( addrgen_exception_o.valid & addrgen_ack_o ) begin + addrgen_exception_load_o = is_load(pe_req_q.op); + addrgen_exception_store_o = !is_load(pe_req_q.op); + end end always_ff @(posedge clk_i or negedge rst_ni) begin @@ -372,7 +386,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= '0; last_elm_subw_q <= '0; idx_op_error_q <= '0; - addrgen_error_vl_o <= '0; + addrgen_exception_vstart_o <= '0; end else begin state_q <= state_d; pe_req_q <= pe_req_d; @@ -382,7 +396,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= idx_op_cnt_d; last_elm_subw_q <= last_elm_subw_d; idx_op_error_q <= idx_op_error_d; - addrgen_error_vl_o <= addrgen_error_vl_d; + 
addrgen_exception_vstart_o <= addrgen_exception_vstart_d; end end @@ -452,7 +466,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_log_d = eff_axi_dw_log_q; idx_addr_ready_d = 1'b0; - addrgen_error_vl_d = '0; + addrgen_exception_vstart_d = '0; // No error by default idx_op_error_d = 1'b0; @@ -752,7 +766,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_error_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index b6904850c..7b1056667 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -42,8 +42,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic [1:0] pe_req_ready_o, // Load (0) and Store (1) units output pe_resp_t [1:0] pe_resp_o, // Load (0) and Store (1) units output logic addrgen_ack_o, - output logic addrgen_error_o, - output vlen_t addrgen_error_vl_o, + output ariane_pkg::exception_t addrgen_exception_o, + output vlen_t addrgen_exception_vstart_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -69,6 +69,11 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + logic load_complete, store_complete; + logic addrgen_exception_load, addrgen_exception_store; + assign load_complete_o = load_complete | addrgen_exception_load; + assign store_complete_o = store_complete | addrgen_exception_store; + /////////////////// // Definitions // /////////////////// @@ -133,8 +138,10 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_valid_i (pe_req_valid_i ), .pe_vinsn_running_i (pe_vinsn_running_i ), .addrgen_ack_o (addrgen_ack_o ), - .addrgen_error_o (addrgen_error_o ), - 
.addrgen_error_vl_o (addrgen_error_vl_o ), + .addrgen_exception_o ( addrgen_exception_o ), + .addrgen_exception_vstart_o ( addrgen_exception_vstart_o ), + .addrgen_exception_load_o ( addrgen_exception_load ), + .addrgen_exception_store_o ( addrgen_exception_store ), // Interface with the lanes .addrgen_operand_i (addrgen_operand_i ), .addrgen_operand_target_fu_i(addrgen_operand_target_fu_i), @@ -165,7 +172,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_r_valid_i (axi_resp.r_valid ), .axi_r_ready_o (axi_req.r_ready ), // Interface with the dispatcher - .load_complete_o (load_complete_o ), + .load_complete_o (load_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -213,7 +220,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_b_ready_o (axi_req.b_ready ), // Interface with the dispatcher .store_pending_o (store_pending_o ), - .store_complete_o (store_complete_o ), + .store_complete_o (store_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ),